From 2b4cf67cd12672c0c8bca04549f0c65400ec6d0d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 21 Apr 2022 03:10:55 -0700 Subject: [PATCH 01/64] Remove single_precision_histogram --- doc/gpu/index.rst | 4 +- doc/parameter.rst | 2 +- include/xgboost/tree_updater.h | 6 +- src/tree/tree_updater.cc | 3 +- src/tree/updater_approx.cc | 9 +- src/tree/updater_basemaker-inl.h | 7 +- src/tree/updater_colmaker.cc | 7 +- src/tree/updater_gpu_hist.cu | 162 +++++++--------------- src/tree/updater_histmaker.cc | 20 ++- src/tree/updater_prune.cc | 10 +- src/tree/updater_quantile_hist.cc | 4 +- src/tree/updater_quantile_hist.h | 3 +- src/tree/updater_refresh.cc | 13 +- src/tree/updater_sync.cc | 9 +- tests/cpp/tree/test_gpu_hist.cu | 16 +-- tests/python-gpu/test_gpu_basic_models.py | 4 +- tests/python-gpu/test_gpu_updaters.py | 3 +- 17 files changed, 107 insertions(+), 175 deletions(-) diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index e36fc72a1746..049cf311dff2 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -59,13 +59,11 @@ Supported parameters +--------------------------------+--------------+ | ``interaction_constraints`` | |tick| | +--------------------------------+--------------+ -| ``single_precision_histogram`` | |tick| | +| ``single_precision_histogram`` | |cross| | +--------------------------------+--------------+ GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``. -The experimental parameter ``single_precision_histogram`` can be set to True to enable building histograms using single precision. This may improve speed, in particular on older architectures. - The device ordinal (which GPU to use if you have many of them) can be selected using the ``gpu_id`` parameter, which defaults to 0 (the first device reported by CUDA runtime). diff --git a/doc/parameter.rst b/doc/parameter.rst index 781150490082..4392b5bf7680 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -240,7 +240,7 @@ Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method * ``single_precision_histogram``, [default= ``false``] - - Use single precision to build histograms instead of double precision. + - Use single precision to build histograms instead of double precision. Currently disabled for ``gpu_hist``. * ``max_cat_to_onehot`` diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 6189221dc0bf..6248a65e270d 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -35,6 +35,7 @@ class TreeUpdater : public Configurable { GenericParameter const* ctx_ = nullptr; public: + explicit TreeUpdater(const GenericParameter* ctx) : ctx_(ctx) {} /*! \brief virtual destructor */ ~TreeUpdater() override = default; /*! @@ -91,8 +92,9 @@ class TreeUpdater : public Configurable { * \brief Registry entry for tree updater. */ struct TreeUpdaterReg - : public dmlc::FunctionRegEntryBase > {}; + : public dmlc::FunctionRegEntryBase< + TreeUpdaterReg, + std::function > {}; /*! * \brief Macro to register tree updater. 
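The documentation changes above drop single_precision_histogram support from gpu_hist; the rest of this patch makes the GPU updater always accumulate histograms in double precision (GradientPairPrecise). As a small standalone illustration of the numerical issue behind that choice, summing many small gradient-like values into a float bin drifts far more than summing them into a double bin. The row count and gradient value below are made up for the demonstration:

    #include <cstdio>

    int main() {
      const int n = 100000000;   // pretend 1e8 rows contribute to one histogram bin
      const float g = 1e-4f;     // a small per-row gradient contribution

      float sum_f = 0.0f;        // single-precision bin
      double sum_d = 0.0;        // double-precision bin
      for (int i = 0; i < n; ++i) {
        sum_f += g;
        sum_d += static_cast<double>(g);
      }

      // The mathematically expected total is about 10000. The float accumulator
      // stalls (roughly around 2048) once each addition is smaller than half a
      // unit in the last place of the running sum, while the double accumulator
      // stays close to the true value.
      std::printf("float  bin: %.4f\n", sum_f);
      std::printf("double bin: %.4f\n", sum_d);
      return 0;
    }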
diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc index 05f6c4bb5fd6..ee5659636305 100644 --- a/src/tree/tree_updater.cc +++ b/src/tree/tree_updater.cc @@ -20,8 +20,7 @@ TreeUpdater* TreeUpdater::Create(const std::string& name, GenericParameter const if (e == nullptr) { LOG(FATAL) << "Unknown tree updater " << name; } - auto p_updater = (e->body)(task); - p_updater->ctx_ = tparam; + auto p_updater = (e->body)(tparam, task); return p_updater; } diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 3bad6f7da4cc..a06f195374b6 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -252,7 +252,10 @@ class GlobalApproxUpdater : public TreeUpdater { ObjInfo task_; public: - explicit GlobalApproxUpdater(ObjInfo task) : task_{task} { monitor_.Init(__func__); } + explicit GlobalApproxUpdater(GenericParameter const *ctx, ObjInfo task) + : task_{task}, TreeUpdater(ctx) { + monitor_.Init(__func__); + } void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); @@ -343,6 +346,8 @@ XGBOOST_REGISTER_TREE_UPDATER(GlobalHistMaker, "grow_histmaker") .describe( "Tree constructor that uses approximate histogram construction " "for each node.") - .set_body([](ObjInfo task) { return new GlobalApproxUpdater(task); }); + .set_body([](GenericParameter const *ctx, ObjInfo task) { + return new GlobalApproxUpdater(ctx, task); + }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_basemaker-inl.h b/src/tree/updater_basemaker-inl.h index da239b2090c7..7fc44a6d15fb 100644 --- a/src/tree/updater_basemaker-inl.h +++ b/src/tree/updater_basemaker-inl.h @@ -33,11 +33,10 @@ namespace tree { * \brief base tree maker class that defines common operation * needed in tree making */ -class BaseMaker: public TreeUpdater { +class BaseMaker : public TreeUpdater { public: - void Configure(const Args& args) override { - param_.UpdateAllowUnknown(args); - } + explicit BaseMaker(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); } void LoadConfig(Json const& in) override { auto const& config = get(in); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index e3d716f2cba8..f4279a0a1c3b 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -57,7 +57,8 @@ DMLC_REGISTER_PARAMETER(ColMakerTrainParam); /*! 
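The hunks above change the updater factory so the context is supplied at construction time ((e->body)(tparam, task)) instead of being patched onto ctx_ after the object exists. A minimal, self-contained sketch of that registration pattern; Context, ObjInfo, Updater and Registry here are stand-ins, not the real XGBoost types:

    #include <functional>
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Context { int gpu_id{-1}; };   // stand-in for GenericParameter
    struct ObjInfo { int task{0}; };      // stand-in for xgboost::ObjInfo

    class Updater {
     public:
      explicit Updater(Context const* ctx) : ctx_(ctx) {}  // context fixed at construction
      virtual ~Updater() = default;
      virtual char const* Name() const = 0;

     protected:
      Context const* ctx_;
    };

    // Registry entry: the factory receives both the context and the task,
    // mirroring the new TreeUpdaterReg signature.
    using Factory = std::function<Updater*(Context const*, ObjInfo)>;

    std::map<std::string, Factory>& Registry() {
      static std::map<std::string, Factory> r;
      return r;
    }

    class Pruner : public Updater {
     public:
      Pruner(Context const* ctx, ObjInfo) : Updater(ctx) {}
      char const* Name() const override { return "prune"; }
    };

    int main() {
      Registry()["prune"] = [](Context const* ctx, ObjInfo task) { return new Pruner(ctx, task); };
      Context ctx{0};
      std::unique_ptr<Updater> up{Registry().at("prune")(&ctx, ObjInfo{})};
      std::cout << up->Name() << "\n";  // prints "prune"
      return 0;
    }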
\brief column-wise update to construct a tree */ class ColMaker: public TreeUpdater { public: - void Configure(const Args& args) override { + explicit ColMaker(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); colmaker_param_.UpdateAllowUnknown(args); } @@ -614,8 +615,8 @@ class ColMaker: public TreeUpdater { XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker") .describe("Grow tree with parallelization over columns.") -.set_body([](ObjInfo) { - return new ColMaker(); +.set_body([](GenericParameter const* ctx, ObjInfo) { + return new ColMaker(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index cb7dd9b7e8e4..2cac6b6c4f4a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -45,12 +45,9 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); // training parameters specific to this algorithm struct GPUHistMakerTrainParam : public XGBoostParameter { - bool single_precision_histogram; bool debug_synchronize; // declare parameters DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { - DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe( - "Use single precision to build histograms."); DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe( "Check if all distributed tree are identical after tree construction."); } @@ -532,6 +529,13 @@ struct GPUHistMakerDevice { void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) { RegTree& tree = *p_tree; + + // Sanity check - have we created a leaf with no training instances? + if (!rabit::IsDistributed()) { + CHECK(row_partitioner->GetRows(candidate.nid).size() > 0) + << "No training instances in this leaf!"; + } + auto parent_sum = candidate.split.left_sum + candidate.split.right_sum; auto base_weight = candidate.base_weight; auto left_weight = candidate.left_weight * param.learning_rate; @@ -676,20 +680,35 @@ struct GPUHistMakerDevice { } }; -template -class GPUHistMakerSpecialised { +class GPUHistMaker : public TreeUpdater { + using GradientSumT = GradientPairPrecise; + public: - explicit GPUHistMakerSpecialised(ObjInfo task) : task_{task} {}; - void Configure(const Args& args, GenericParameter const* generic_param) { + explicit GPUHistMaker(GenericParameter const* ctx, ObjInfo task) + : TreeUpdater(ctx), task_{task} {}; + void Configure(const Args& args) { + // Used in test to count how many configurations are performed + LOG(DEBUG) << "[GPU Hist]: Configure"; param_.UpdateAllowUnknown(args); - generic_param_ = generic_param; hist_maker_param_.UpdateAllowUnknown(args); dh::CheckComputeCapability(); monitor_.Init("updater_gpu_hist"); } - ~GPUHistMakerSpecialised() { // NOLINT + void LoadConfig(Json const& in) override { + auto const& config = get(in); + FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_); + initialised_ = false; + FromJson(config.at("train_param"), ¶m_); + } + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["gpu_hist_train_param"] = ToJson(hist_maker_param_); + out["train_param"] = ToJson(param_); + } + + ~GPUHistMaker() { // NOLINT dh::GlobalMemoryLogger().Log(); } @@ -719,30 +738,24 @@ class GPUHistMakerSpecialised { } void InitDataOnce(DMatrix* dmat) { - device_ = generic_param_->gpu_id; - CHECK_GE(device_, 0) << "Must have at least one device"; + CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; info_ = &dmat->Info(); - reducer_.Init({device_}); // NOLINT + 
reducer_.Init({ctx_->gpu_id}); // NOLINT // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); BatchParam batch_param{ - device_, - param_.max_bin, + ctx_->gpu_id, + param_.max_bin, }; auto page = (*dmat->GetBatches(batch_param).begin()).Impl(); - dh::safe_cuda(cudaSetDevice(device_)); - info_->feature_types.SetDevice(device_); - maker.reset(new GPUHistMakerDevice(device_, - page, - info_->feature_types.ConstDeviceSpan(), - info_->num_row_, - param_, - column_sampling_seed, - info_->num_col_, - batch_param)); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + info_->feature_types.SetDevice(ctx_->gpu_id); + maker.reset(new GPUHistMakerDevice( + ctx_->gpu_id, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_, + column_sampling_seed, info_->num_col_, batch_param)); p_last_fmat_ = dmat; initialised_ = true; @@ -766,7 +779,7 @@ class GPUHistMakerSpecialised { } fs.Seek(0); rabit::Broadcast(&s_model, 0); - RegTree reference_tree {}; // rank 0 tree + RegTree reference_tree{}; // rank 0 tree reference_tree.Load(&fs); CHECK(*local_tree == reference_tree); } @@ -775,13 +788,11 @@ class GPUHistMakerSpecialised { monitor_.Start("InitData"); this->InitData(p_fmat); monitor_.Stop("InitData"); - - gpair->SetDevice(device_); + gpair->SetDevice(ctx_->gpu_id); maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_); } - bool UpdatePredictionCache(const DMatrix *data, - linalg::VectorView p_out_preds) { + bool UpdatePredictionCache(const DMatrix* data, linalg::VectorView p_out_preds) { if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } @@ -791,107 +802,32 @@ class GPUHistMakerSpecialised { return true; } - TrainParam param_; // NOLINT - MetaInfo* info_{}; // NOLINT + TrainParam param_; // NOLINT + MetaInfo* info_{}; // NOLINT std::unique_ptr> maker; // NOLINT + char const* Name() const override { return "grow_gpu_hist"; } + private: - bool initialised_ { false }; + bool initialised_{false}; GPUHistMakerTrainParam hist_maker_param_; - GenericParameter const* generic_param_; dh::AllReducer reducer_; - DMatrix* p_last_fmat_ { nullptr }; - int device_{-1}; + DMatrix* p_last_fmat_{nullptr}; ObjInfo task_; common::Monitor monitor_; }; -class GPUHistMaker : public TreeUpdater { - public: - explicit GPUHistMaker(ObjInfo task) : task_{task} {} - void Configure(const Args& args) override { - // Used in test to count how many configurations are performed - LOG(DEBUG) << "[GPU Hist]: Configure"; - hist_maker_param_.UpdateAllowUnknown(args); - // The passed in args can be empty, if we simply purge the old maker without - // preserving parameters then we can't do Update on it. 
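With the specialised float/double makers gone, GPUHistMaker itself owns the LoadConfig/SaveConfig pair added above for both parameter groups, and clears initialised_ so the next Update re-runs data initialisation. A rough host-side sketch of that contract, using a plain string map in place of XGBoost's Json; all struct and field names here are illustrative only:

    #include <cassert>
    #include <map>
    #include <string>

    // Illustrative stand-ins for the two parameter groups handled by SaveConfig/LoadConfig.
    struct HistMakerParam { bool debug_synchronize{false}; };
    struct TrainParamLite { int max_depth{6}; double eta{0.3}; };

    using KV = std::map<std::string, std::string>;

    struct Maker {
      HistMakerParam hist_param;
      TrainParamLite train_param;
      bool initialised{false};

      KV SaveConfig() const {
        return {{"debug_synchronize", hist_param.debug_synchronize ? "1" : "0"},
                {"max_depth", std::to_string(train_param.max_depth)},
                {"eta", std::to_string(train_param.eta)}};
      }
      void LoadConfig(KV const& in) {
        hist_param.debug_synchronize = in.at("debug_synchronize") == "1";
        train_param.max_depth = std::stoi(in.at("max_depth"));
        train_param.eta = std::stod(in.at("eta"));
        initialised = false;  // force data re-initialisation on the next Update, as in the patch
      }
    };

    int main() {
      Maker a;
      a.train_param.max_depth = 8;
      a.initialised = true;

      Maker b;
      b.LoadConfig(a.SaveConfig());  // round-trips both parameter groups
      assert(b.train_param.max_depth == 8);
      assert(!b.initialised);
      return 0;
    }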
- TrainParam param; - if (float_maker_) { - param = float_maker_->param_; - } else if (double_maker_) { - param = double_maker_->param_; - } - if (hist_maker_param_.single_precision_histogram) { - float_maker_.reset(new GPUHistMakerSpecialised(task_)); - float_maker_->param_ = param; - float_maker_->Configure(args, ctx_); - } else { - double_maker_.reset(new GPUHistMakerSpecialised(task_)); - double_maker_->param_ = param; - double_maker_->Configure(args, ctx_); - } - } - - void LoadConfig(Json const& in) override { - auto const& config = get(in); - FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_); - if (hist_maker_param_.single_precision_histogram) { - float_maker_.reset(new GPUHistMakerSpecialised(task_)); - FromJson(config.at("train_param"), &float_maker_->param_); - } else { - double_maker_.reset(new GPUHistMakerSpecialised(task_)); - FromJson(config.at("train_param"), &double_maker_->param_); - } - } - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; - out["gpu_hist_train_param"] = ToJson(hist_maker_param_); - if (hist_maker_param_.single_precision_histogram) { - out["train_param"] = ToJson(float_maker_->param_); - } else { - out["train_param"] = ToJson(double_maker_->param_); - } - } - - void Update(HostDeviceVector* gpair, DMatrix* dmat, - const std::vector& trees) override { - if (hist_maker_param_.single_precision_histogram) { - float_maker_->Update(gpair, dmat, trees); - } else { - double_maker_->Update(gpair, dmat, trees); - } - } - - bool - UpdatePredictionCache(const DMatrix *data, - linalg::VectorView p_out_preds) override { - if (hist_maker_param_.single_precision_histogram) { - return float_maker_->UpdatePredictionCache(data, p_out_preds); - } else { - return double_maker_->UpdatePredictionCache(data, p_out_preds); - } - } - - char const* Name() const override { - return "grow_gpu_hist"; - } - - private: - GPUHistMakerTrainParam hist_maker_param_; - ObjInfo task_; - std::unique_ptr> float_maker_; - std::unique_ptr> double_maker_; -}; - #if !defined(GTEST_TEST) XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist") .describe("Grow tree with GPU.") - .set_body([](ObjInfo task) { return new GPUHistMaker(task); }); + .set_body([](GenericParameter const* tparam, ObjInfo task) { + return new GPUHistMaker(tparam, task); + }); #endif // !defined(GTEST_TEST) } // namespace tree diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index 0a85d2d73832..9d36e4d16c0a 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -24,9 +24,9 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker); class HistMaker: public BaseMaker { public: - void Update(HostDeviceVector *gpair, - DMatrix *p_fmat, - const std::vector &trees) override { + explicit HistMaker(GenericParameter const *ctx) : BaseMaker(ctx) {} + void Update(HostDeviceVector *gpair, DMatrix *p_fmat, + const std::vector &trees) override { interaction_constraints_.Configure(param_, p_fmat->Info().num_col_); // rescale learning rate according to size of trees float lr = param_.learning_rate; @@ -262,12 +262,10 @@ class HistMaker: public BaseMaker { } }; -class CQHistMaker: public HistMaker { +class CQHistMaker : public HistMaker { public: - CQHistMaker() = default; - char const* Name() const override { - return "grow_local_histmaker"; - } + explicit CQHistMaker(GenericParameter const *ctx) : HistMaker(ctx) {} + char const *Name() const override { return "grow_local_histmaker"; } protected: struct HistEntry { @@ -624,9 +622,7 @@ class CQHistMaker: public 
HistMaker { }; XGBOOST_REGISTER_TREE_UPDATER(LocalHistMaker, "grow_local_histmaker") -.describe("Tree constructor that uses approximate histogram construction.") -.set_body([](ObjInfo) { - return new CQHistMaker(); - }); + .describe("Tree constructor that uses approximate histogram construction.") + .set_body([](GenericParameter const *ctx, ObjInfo) { return new CQHistMaker(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index f71f1c698cb9..9e6fad883040 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -21,9 +21,9 @@ namespace tree { DMLC_REGISTRY_FILE_TAG(updater_prune); /*! \brief pruner that prunes a tree after growing finishes */ -class TreePruner: public TreeUpdater { +class TreePruner : public TreeUpdater { public: - explicit TreePruner(ObjInfo task) { + explicit TreePruner(GenericParameter const* ctx, ObjInfo task) : TreeUpdater(ctx) { syncher_.reset(TreeUpdater::Create("sync", ctx_, task)); pruner_monitor_.Init("TreePruner"); } @@ -112,9 +112,7 @@ class TreePruner: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune") -.describe("Pruner that prune the tree according to statistics.") -.set_body([](ObjInfo task) { - return new TreePruner(task); - }); + .describe("Pruner that prune the tree according to statistics.") + .set_body([](GenericParameter const* ctx, ObjInfo task) { return new TreePruner(ctx, task); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 0e1b6db47691..dcbb3dbfba3e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -390,6 +390,8 @@ template struct QuantileHistMaker::Builder; XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker") .describe("Grow tree using quantized histogram.") - .set_body([](ObjInfo task) { return new QuantileHistMaker(task); }); + .set_body([](GenericParameter const *ctx, ObjInfo task) { + return new QuantileHistMaker(ctx, task); + }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 3c03a371ebfb..463c7a54ab39 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -225,7 +225,8 @@ inline BatchParam HistBatch(TrainParam const& param) { /*! \brief construct a tree using quantized feature values */ class QuantileHistMaker: public TreeUpdater { public: - explicit QuantileHistMaker(ObjInfo task) : task_{task} {} + explicit QuantileHistMaker(GenericParameter const* ctx, ObjInfo task) + : task_{task}, TreeUpdater(ctx) {} void Configure(const Args& args) override; void Update(HostDeviceVector* gpair, diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index d17c1e1444f7..6110e964f891 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -22,11 +22,10 @@ namespace tree { DMLC_REGISTRY_FILE_TAG(updater_refresh); /*! 
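One C++ detail about the initializer lists introduced in these constructors (for example ": task_{task}, TreeUpdater(ctx)" in QuantileHistMaker below): the base class is always constructed before non-static members regardless of the order written in the list, so the behaviour is the same either way, although compilers may issue a -Wreorder warning. A tiny standalone illustration:

    #include <iostream>

    struct Base {
      explicit Base(int v) { std::cout << "Base(" << v << ")\n"; }
    };

    struct Derived : Base {
      // Written member-first, but Base still runs first: initialisation order is
      // bases, then members in declaration order, not initializer-list order.
      explicit Derived(int v) : member_{v}, Base(v) { std::cout << "Derived body\n"; }
      int member_;
    };

    int main() {
      Derived d{42};  // prints "Base(42)", then "Derived body"
      return 0;
    }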
\brief pruner that prunes a tree after growing finishs */ -class TreeRefresher: public TreeUpdater { +class TreeRefresher : public TreeUpdater { public: - void Configure(const Args& args) override { - param_.UpdateAllowUnknown(args); - } + explicit TreeRefresher(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); } void LoadConfig(Json const& in) override { auto const& config = get(in); FromJson(config.at("train_param"), &this->param_); @@ -160,9 +159,7 @@ class TreeRefresher: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh") -.describe("Refresher that refreshes the weight and statistics according to data.") -.set_body([](ObjInfo) { - return new TreeRefresher(); - }); + .describe("Refresher that refreshes the weight and statistics according to data.") + .set_body([](GenericParameter const *ctx, ObjInfo) { return new TreeRefresher(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 4f7c7a1a85a6..5a22675965dc 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -20,8 +20,9 @@ DMLC_REGISTRY_FILE_TAG(updater_sync); * \brief syncher that synchronize the tree in all distributed nodes * can implement various strategies, so far it is always set to node 0's tree */ -class TreeSyncher: public TreeUpdater { +class TreeSyncher : public TreeUpdater { public: + explicit TreeSyncher(GenericParameter const* tparam) : TreeUpdater(tparam) {} void Configure(const Args&) override {} void LoadConfig(Json const&) override {} @@ -52,9 +53,7 @@ class TreeSyncher: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync") -.describe("Syncher that synchronize the tree in all distributed nodes.") -.set_body([](ObjInfo) { - return new TreeSyncher(); - }); + .describe("Syncher that synchronize the tree in all distributed nodes.") + .set_body([](GenericParameter const* tparam, ObjInfo) { return new TreeSyncher(tparam); }); } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 82f40465deb2..883537863307 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -275,8 +275,10 @@ void TestHistogramIndexImpl() { int constexpr kNRows = 1000, kNCols = 10; // Build 2 matrices and build a histogram maker with that - tree::GPUHistMakerSpecialised hist_maker{ObjInfo{ObjInfo::kRegression}}, - hist_maker_ext{ObjInfo{ObjInfo::kRegression}}; + + GenericParameter generic_param(CreateEmptyGenericParam(0)); + tree::GPUHistMaker hist_maker{&generic_param,ObjInfo{ObjInfo::kRegression}}, + hist_maker_ext{&generic_param,ObjInfo{ObjInfo::kRegression}}; std::unique_ptr hist_maker_dmat( CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true)); @@ -289,10 +291,9 @@ void TestHistogramIndexImpl() { {"max_leaves", "0"} }; - GenericParameter generic_param(CreateEmptyGenericParam(0)); - hist_maker.Configure(training_params, &generic_param); + hist_maker.Configure(training_params); hist_maker.InitDataOnce(hist_maker_dmat.get()); - hist_maker_ext.Configure(training_params, &generic_param); + hist_maker_ext.Configure(training_params); hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get()); // Extract the device maker from the histogram makers and from that its compressed @@ -344,10 +345,9 @@ void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, {"sampling_method", sampling_method}, }; - tree::GPUHistMakerSpecialised 
hist_maker{ObjInfo{ObjInfo::kRegression}}; GenericParameter generic_param(CreateEmptyGenericParam(0)); - hist_maker.Configure(args, &generic_param); - + tree::GPUHistMaker hist_maker{&generic_param,ObjInfo{ObjInfo::kRegression}}; + hist_maker.Configure(args); hist_maker.Update(gpair, dmat, {tree}); auto cache = linalg::VectorView{preds->DeviceSpan(), {preds->Size()}, 0}; hist_maker.UpdatePredictionCache(dmat, cache); diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py index 06e63bdd56d9..9e955eac2931 100644 --- a/tests/python-gpu/test_gpu_basic_models.py +++ b/tests/python-gpu/test_gpu_basic_models.py @@ -16,11 +16,11 @@ class TestGPUBasicModels: cpu_test_bm = test_bm.TestModels() def run_cls(self, X, y): - cls = xgb.XGBClassifier(tree_method='gpu_hist', single_precision_histogram=True) + cls = xgb.XGBClassifier(tree_method='gpu_hist') cls.fit(X, y) cls.get_booster().save_model('test_deterministic_gpu_hist-0.json') - cls = xgb.XGBClassifier(tree_method='gpu_hist', single_precision_histogram=True) + cls = xgb.XGBClassifier(tree_method='gpu_hist') cls.fit(X, y) cls.get_booster().save_model('test_deterministic_gpu_hist-1.json') diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index a3427b566360..8f3cbcaac61f 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note +from hypothesis import given, strategies, assume, settings, note, reproduce_failure sys.path.append("tests/python") import testing as tm @@ -15,7 +15,6 @@ 'max_leaves': strategies.integers(0, 256), 'max_bin': strategies.integers(2, 1024), 'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']), - 'single_precision_histogram': strategies.booleans(), 'min_child_weight': strategies.floats(0.5, 2.0), 'seed': strategies.integers(0, 10), # We cannot enable subsampling as the training loss can increase From f140ebcb2f0219486ad8702eaa322e0f9da624ea Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 25 Apr 2022 04:27:15 -0700 Subject: [PATCH 02/64] Batch nodes from driver --- src/tree/driver.h | 33 +++++++++--- src/tree/updater_approx.cc | 2 +- src/tree/updater_gpu_hist.cu | 69 +++++++++++++------------- src/tree/updater_quantile_hist.cc | 2 +- tests/cpp/tree/gpu_hist/test_driver.cu | 33 +++++++----- 5 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index abb8afadcb8a..1e40cc32622f 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -33,9 +33,9 @@ class Driver { std::function>; public: - explicit Driver(TrainParam::TreeGrowPolicy policy) - : policy_(policy), - queue_(policy == TrainParam::kDepthWise ? DepthWise : + explicit Driver(TrainParam param) + : param_(param), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {} template void Push(EntryIterT begin, EntryIterT end) { @@ -55,16 +55,30 @@ class Driver { return queue_.empty(); } + // Can a child of this entry still be expanded? 
+ // can be used to avoid extra work + bool IsChildValid(ExpandEntryT const& parent_entry){ + if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; + if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; + return true; + } + // Return the set of nodes to be expanded // This set has no dependencies between entries so they may be expanded in // parallel or asynchronously std::vector Pop() { if (queue_.empty()) return {}; // Return a single entry for loss guided mode - if (policy_ == TrainParam::kLossGuide) { + if (param_.grow_policy == TrainParam::kLossGuide) { ExpandEntryT e = queue_.top(); queue_.pop(); - return {e}; + + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + return {e}; + } else { + return {}; + } } // Return nodes on same level for depth wise std::vector result; @@ -72,7 +86,11 @@ class Driver { int level = e.depth; while (e.depth == level && !queue_.empty()) { queue_.pop(); - result.emplace_back(e); + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + result.emplace_back(e); + } + if (!queue_.empty()) { e = queue_.top(); } @@ -81,7 +99,8 @@ class Driver { } private: - TrainParam::TreeGrowPolicy policy_; + TrainParam param_; + std::size_t num_leaves_=1; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index a06f195374b6..1c6b195ab34b 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -169,7 +169,7 @@ class GloablApproxBuilder { p_last_tree_ = p_tree; this->InitData(p_fmat, hess); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); auto &tree = *p_tree; driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)}); bst_node_t num_leaves{1}; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2cac6b6c4f4a..2340687983a8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -531,7 +531,7 @@ struct GPUHistMakerDevice { RegTree& tree = *p_tree; // Sanity check - have we created a leaf with no training instances? - if (!rabit::IsDistributed()) { + if (!rabit::IsDistributed() && row_partitioner) { CHECK(row_partitioner->GetRows(candidate.nid).size() > 0) << "No training instances in this leaf!"; } @@ -616,7 +616,7 @@ struct GPUHistMakerDevice { void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo task, RegTree* p_tree, dh::AllReducer* reducer) { auto& tree = *p_tree; - Driver driver(static_cast(param.grow_policy)); + Driver driver(param); monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_, task); @@ -626,48 +626,49 @@ struct GPUHistMakerDevice { driver.Push({ this->InitRoot(p_tree, task, reducer) }); monitor.Stop("InitRoot"); - auto num_leaves = 1; - // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { + for(auto & candidate: expand_set){ + this->ApplySplit(candidate, p_tree); + } + // Get the candidates we are allowed to expand further + // e.g. 
We do not bother further processing nodes whose children are beyond max depth + std::vector filtered_expand_set; + std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), + [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = - pinned.GetSpan(expand_set.size() * 2, GPUExpandEntry()); + pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); + + for(const auto &e:filtered_expand_set){ + monitor.Start("UpdatePosition"); + // Update position is only run when child is valid, instead of right after apply + // split (as in approx tree method). Hense we have the finalise position call + // in GPU Hist. + this->UpdatePosition(e.nid, p_tree); + monitor.Stop("UpdatePosition"); + } - for (auto i = 0ull; i < expand_set.size(); i++) { + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { auto candidate = expand_set.at(i); - if (!candidate.IsValid(param, num_leaves)) { - continue; - } - this->ApplySplit(candidate, p_tree); + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); - num_leaves++; + monitor.Start("BuildHist"); + this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); + monitor.Stop("BuildHist"); + } + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { + auto candidate = expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed - if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), - num_leaves)) { - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. 
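Taken together, the Driver changes above move leaf accounting and validity filtering into the driver itself: Pop() returns only entries that are still allowed to become splits, and IsChildValid() lets the caller skip histogram work for children that could never be expanded, which is what the rewritten UpdateTree loop does with filtered_expand_set. A simplified host-only sketch of how a builder drives that interface, with stand-in Entry/Param types and a per-level FIFO instead of the real priority queue:

    #include <cstdio>
    #include <deque>
    #include <vector>

    struct Entry { int nid{0}; int depth{0}; float loss_chg{0.f}; };
    struct Param { int max_depth{2}; int max_leaves{0}; };

    class Driver {
     public:
      explicit Driver(Param p) : p_(p) {}
      void Push(std::vector<Entry> const& es) {
        for (auto const& e : es) q_.push_back(e);
      }
      // Can a child of this entry still be expanded? Lets the caller skip
      // histogram work for children that can never become splits.
      bool IsChildValid(Entry const& parent) const {
        if (p_.max_depth > 0 && parent.depth + 1 >= p_.max_depth) return false;
        if (p_.max_leaves > 0 && n_leaves_ >= p_.max_leaves) return false;
        return true;
      }
      // Pop every candidate on the current level, counting leaves and dropping
      // entries that bring no gain.
      std::vector<Entry> Pop() {
        std::vector<Entry> out;
        if (q_.empty()) return out;
        int level = q_.front().depth;
        while (!q_.empty() && q_.front().depth == level) {
          Entry e = q_.front();
          q_.pop_front();
          if (e.loss_chg > 0.f) {  // stand-in for ExpandEntry::IsValid()
            ++n_leaves_;
            out.push_back(e);
          }
        }
        return out;
      }

     private:
      Param p_;
      int n_leaves_{1};
      std::deque<Entry> q_;
    };

    int main() {
      Driver driver{Param{}};
      driver.Push({Entry{0, 0, 1.f}});  // root candidate
      while (true) {
        auto batch = driver.Pop();      // a whole level at once
        if (batch.empty()) break;
        std::vector<Entry> next;
        for (auto const& e : batch) {
          std::printf("apply split at node %d (depth %d)\n", e.nid, e.depth);
          if (!driver.IsChildValid(e)) continue;  // no histograms for dead ends
          next.push_back(Entry{2 * e.nid + 1, e.depth + 1, 1.f});
          next.push_back(Entry{2 * e.nid + 2, e.depth + 1, 1.f});
        }
        driver.Push(next);
      }
      return 0;
    }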
- this->UpdatePosition(candidate.nid, p_tree); - monitor.Stop("UpdatePosition"); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - - monitor.Start("EvaluateSplits"); - this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree, - new_candidates.subspan(i * 2, 2)); - monitor.Stop("EvaluateSplits"); - } else { - // Set default - new_candidates[i * 2] = GPUExpandEntry(); - new_candidates[i * 2 + 1] = GPUExpandEntry(); - } + + monitor.Start("EvaluateSplits"); + this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree, + new_candidates.subspan(i * 2, 2)); + monitor.Stop("EvaluateSplits"); } dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index dcbb3dbfba3e..bdda543d75a7 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -174,7 +174,7 @@ void QuantileHistMaker::Builder::ExpandTree( DMatrix *p_fmat, RegTree *p_tree, const std::vector &gpair_h) { monitor_->Start(__func__); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu index d35f3510f628..d7f8cc63869e 100644 --- a/tests/cpp/tree/gpu_hist/test_driver.cu +++ b/tests/cpp/tree/gpu_hist/test_driver.cu @@ -6,16 +6,21 @@ namespace xgboost { namespace tree { TEST(GpuHist, DriverDepthWise) { - Driver driver(TrainParam::kDepthWise); + TrainParam p; + p.InitAllowUnknown(Args{}); + p.grow_policy=TrainParam::kDepthWise; + Driver driver(p); EXPECT_TRUE(driver.Pop().empty()); DeviceSplitCandidate split; split.loss_chg = 1.0f; - GPUExpandEntry root(0, 0, split, .0f, .0f, .0f); + split.left_sum = {0.0f, 1.0f}; + split.right_sum = {0.0f, 1.0f}; + GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); - driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{3, 2, split, 2.0f, 1.0f, 1.0f}}); // Should return entries from level 1 auto res = driver.Pop(); EXPECT_EQ(res.size(), 2); @@ -29,18 +34,22 @@ TEST(GpuHist, DriverDepthWise) { TEST(GpuHist, DriverLossGuided) { DeviceSplitCandidate high_gain; + high_gain.left_sum = {0.0f, 1.0f}; + high_gain.right_sum = {0.0f, 1.0f}; high_gain.loss_chg = 5.0f; - DeviceSplitCandidate low_gain; + DeviceSplitCandidate low_gain = high_gain; low_gain.loss_chg = 1.0f; - Driver driver(TrainParam::kLossGuide); + TrainParam p; + p.grow_policy=TrainParam::kLossGuide; + Driver driver(p); EXPECT_TRUE(driver.Pop().empty()); - GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f); + GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f ); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); // Select high gain first - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 
1.0f, 1.0f}}); auto res = driver.Pop(); EXPECT_EQ(res.size(), 1); EXPECT_EQ(res[0].nid, 2); @@ -49,8 +58,8 @@ TEST(GpuHist, DriverLossGuided) { EXPECT_EQ(res[0].nid, 1); // If equal gain, use nid - driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); res = driver.Pop(); EXPECT_EQ(res[0].nid, 1); res = driver.Pop(); From 80a3e78f9e1dcbf2a78f6572897453c61afd60b0 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 29 Apr 2022 02:22:26 -0700 Subject: [PATCH 03/64] Categoricals broken --- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/evaluator.cu | 73 +++---- src/tree/gpu_hist/histogram.cu | 9 - src/tree/updater_gpu_hist.cu | 223 +++++++++++++--------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 - tests/cpp/tree/test_gpu_hist.cu | 45 +++-- 6 files changed, 186 insertions(+), 167 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index b03fd7b41b51..7d792051e5be 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -92,7 +92,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs . + * \brief Get sorted index storage based on the left node of inputs. */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index bc2027489131..381ef8fbb349 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -21,55 +21,36 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical() && !task.UseOneHot()) { + if (cuts.HasCategorical()) { dh::XGBCachingDeviceAllocator alloc; - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - auto beg = thrust::make_counting_iterator(1ul); - auto end = thrust::make_counting_iterator(ptrs.size()); - auto to_onehot = param.max_cat_to_onehot; - // This condition avoids sort-based split function calls if the users want - // onehot-encoding-based splits. - // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. - has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { - auto idx = i - 1; - if (common::IsCat(ft, idx)) { - auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); - return use_sort; - } - return false; - }); - - if (has_sort_) { - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
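After this change the evaluator allocates category bit storage whenever the cuts contain any categorical feature, sized by the largest category value and the maximum number of tree nodes. The arithmetic is simply one bit per category packed into 32-bit words, replicated per node; a small sketch of that sizing, where the MaxNodes() formula for a depth-limited tree and the concrete numbers are assumptions for illustration:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // One bit per category, packed into 32-bit words (simplified analogue of
    // common::CatBitField::ComputeStorageSize).
    std::size_t ComputeStorageWords(std::size_t n_cats) {
      return (n_cats + 31) / 32;
    }

    int main() {
      std::size_t max_category = 69;  // largest encoded category seen in the cuts
      int max_depth = 6;
      std::size_t max_nodes = (1u << (max_depth + 1)) - 1;  // full binary tree of that depth

      std::size_t words_per_node = ComputeStorageWords(max_category + 1);
      std::vector<std::uint32_t> split_cats(max_nodes * words_per_node, 0u);

      std::printf("%zu categories -> %zu words per node, %zu words for %zu nodes\n",
                  max_category + 1, words_per_node, split_cats.size(), max_nodes);
      return 0;
    }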
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. + sort_input_.resize(cat_sorted_idx_.size()); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), - feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); - } + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), + [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); } } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 791363a05cdd..efb08d5e44e2 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } -template void BuildGradientHistogram( - EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, - HistRounding rounding, - bool force_global_memory); - template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2340687983a8..2cd9d4babeb1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -57,7 +57,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogram + * \struct DeviceHistogramStorage * * \summary Data storage for node histograms on device. Automatically expands. * @@ -67,12 +67,18 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogram { +template +class DeviceHistogramStorage { private: /*! \brief Map nidx to starting index of its histogram. 
*/ std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -81,6 +87,8 @@ class DeviceHistogram { "Number of items in gradient type should be 2."); public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -91,21 +99,53 @@ class DeviceHistogram { dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); + overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { - return n_bins_ * kNumItemsInGradientSum; - } + size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } - dh::device_vector& Data() { - return data_; + void AllocateHistograms(const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } + // Number of items currently used in data + const size_t used_size = nidx_map_.size() * HistogramSize(); + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); + } + } else { + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); + } + } + + CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } + /* void AllocateHistogram(int nidx) { if (HistogramExists(nidx)) return; // Number of items currently used in data @@ -139,6 +179,7 @@ class DeviceHistogram { CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } + */ /** * \summary Return pointer to histogram memory for a given node. 
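The reworked DeviceHistogramStorage keeps a large cached buffer and, once the cache limit is reached, serves each new batch from a small overflow buffer that is simply overwritten by the next request, so cached nodes persist while overflow nodes are transient. A host-side miniature of the same policy, with std::vector/std::map standing in for the device containers and purely illustrative sizes:

    #include <cassert>
    #include <cstddef>
    #include <map>
    #include <vector>

    class HistogramStorage {
     public:
      HistogramStorage(std::size_t bins, std::size_t max_cached_nodes)
          : bins_(bins), stop_growing_(max_cached_nodes * bins) {}

      void AllocateHistograms(std::vector<int> const& new_nidxs) {
        std::size_t used = cached_.size() * bins_;
        if (used >= stop_growing_) {
          // Cache is full: recycle the overflow buffer for this batch only.
          overflow_map_.clear();
          overflow_.assign(new_nidxs.size() * bins_, 0.0);
          for (int nidx : new_nidxs) {
            std::size_t offset = overflow_map_.size() * bins_;
            overflow_map_[nidx] = offset;
          }
        } else {
          // Grow the cache and append the new nodes behind the existing ones.
          data_.resize(used + new_nidxs.size() * bins_, 0.0);
          for (int nidx : new_nidxs) {
            std::size_t offset = cached_.size() * bins_;
            cached_[nidx] = offset;
          }
        }
      }
      double* GetNodeHistogram(int nidx) {
        auto it = cached_.find(nidx);
        if (it != cached_.end()) return data_.data() + it->second;
        return overflow_.data() + overflow_map_.at(nidx);
      }
      bool Exists(int nidx) const {
        return cached_.count(nidx) > 0 || overflow_map_.count(nidx) > 0;
      }

     private:
      std::size_t bins_, stop_growing_;
      std::map<int, std::size_t> cached_, overflow_map_;
      std::vector<double> data_, overflow_;
    };

    int main() {
      HistogramStorage hist(/*bins=*/4, /*max_cached_nodes=*/2);
      hist.AllocateHistograms({1, 2});   // fits in the cache
      hist.GetNodeHistogram(1)[0] = 5.0;
      hist.AllocateHistograms({3, 4});   // cache limit reached -> overflow
      hist.AllocateHistograms({5, 6});   // overwrites the overflow slots of 3 and 4
      assert(hist.Exists(1) && hist.Exists(2));    // cached nodes survive
      assert(!hist.Exists(3) && hist.Exists(5));   // overflow nodes are transient
      assert(hist.GetNodeHistogram(1)[0] == 5.0);  // cached data is preserved
      return 0;
    }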
@@ -147,9 +188,16 @@ class DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span( - reinterpret_cast(ptr), n_bins_); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } } }; @@ -166,7 +214,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogram hist{}; + DeviceHistogramStorage hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -189,8 +237,6 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; - // Storing split categories for last node. - dh::caching_device_vector node_categories; GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -319,7 +365,6 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { - hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(device_id), @@ -327,8 +372,12 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - void SubtractionTrick(int nidx_parent, int nidx_histogram, - int nidx_subtraction) { + // Attempt to the subtraction trick + // return true if succeeded + bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { + return false; + } auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -337,22 +386,18 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); + return true; } - bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - // Make sure histograms are already allocated - hist.AllocateHistogram(nidx_subtraction); - return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); - } - - void UpdatePosition(int nidx, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[nidx]; - auto split_type = p_tree->NodeSplitType(nidx); + void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); auto d_matrix = page->GetDeviceAccessor(device_id); - auto node_cats = dh::ToSpan(node_categories); + auto node_cats = e.split.split_cats.Bits(); + row_partitioner->UpdatePosition( - nidx, split_node.LeftChild(), split_node.RightChild(), + e.nid, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -483,13 +528,15 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - void AllReduceHist(int nidx, dh::AllReducer* reducer) { + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = 
hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum( - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->AllReduceSum(reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * + num_histograms); monitor.Stop("AllReduce"); } @@ -497,33 +544,49 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, - int nidx_right, dh::AllReducer* reducer) { - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); - if (fewer_right) { - std::swap(build_hist_nidx, subtraction_trick_nidx); + void BuildHistLeftRight(std::vectorconst &candidates, dh::AllReducer* reducer, const RegTree& tree) { + if(candidates.empty()) return; + // Some nodes we will manually compute histograms + // others we will do by subtraction + std::vector hist_nidx; + std::vector subtraction_nidx; + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); + if (fewer_right) { + hist_nidx.emplace_back(tree[e.nid].RightChild()); + subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); + } else { + hist_nidx.emplace_back(tree[e.nid].LeftChild()); + subtraction_nidx.emplace_back(tree[e.nid].RightChild()); + } + } + std::vector all_new = hist_nidx; + all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); + // Allocate the histograms + // Guaranteed contiguous memory + hist.AllocateHistograms(all_new); + + for(auto nidx:hist_nidx){ + this->BuildHist(nidx); } - this->BuildHist(build_hist_nidx); - this->AllReduceHist(build_hist_nidx, reducer); + // Reduce all in one go + // This gives much better latency in a distributed setting + // when processing a large batch + this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = this->CanDoSubtractionTrick( - candidate.nid, build_hist_nidx, subtraction_trick_nidx); + for (int i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = hist_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - this->SubtractionTrick(candidate.nid, build_hist_nidx, - subtraction_trick_nidx); - } else { - // Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer); + if(!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)){ + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer, 1); + } } } @@ -546,27 +609,11 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value 
too large."; std::vector split_cats; - if (candidate.split.split_cats.Bits().empty()) { - if (common::InvalidCat(candidate.split.fvalue)) { - common::InvalidCategory(); - } - auto cat = common::AsCat(candidate.split.fvalue); - split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); - common::CatBitField cats_bits(split_cats); - cats_bits.Set(cat); - dh::CopyToD(split_cats, &node_categories); - } else { - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - - node_categories.resize(candidate.split.split_cats.Bits().size()); - dh::safe_cuda(cudaMemcpyAsync( - node_categories.data().get(), candidate.split.split_cats.Data(), - candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); - } + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, @@ -598,8 +645,9 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); + hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer); + this->AllReduceHist(kRootNIdx, reducer, 1); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -638,6 +686,7 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -646,22 +695,16 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
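Because each level's child histograms are now allocated as one contiguous block, the per-node reductions above collapse into a single AllReduceSum over num_histograms * bins gradient pairs, and the sibling that was not built directly (the one with the larger hessian sum) is recovered with the subtraction trick, sibling = parent - built_child. A host-side sketch of both ideas, with the all-reduce replaced by an element-wise sum across two simulated workers:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct GradPair { double grad{0.0}, hess{0.0}; };

    // Stand-in for the single AllReduceSum over a contiguous block of histograms:
    // every worker contributes its block and receives the element-wise sum.
    void AllReduceSum(std::vector<GradPair>* local, std::vector<GradPair> const& other_worker) {
      for (std::size_t i = 0; i < local->size(); ++i) {
        (*local)[i].grad += other_worker[i].grad;
        (*local)[i].hess += other_worker[i].hess;
      }
    }

    // Subtraction trick: the sibling histogram is the parent minus the built child.
    std::vector<GradPair> Subtract(std::vector<GradPair> const& parent,
                                   std::vector<GradPair> const& built_child) {
      std::vector<GradPair> out(parent.size());
      for (std::size_t i = 0; i < parent.size(); ++i) {
        out[i].grad = parent[i].grad - built_child[i].grad;
        out[i].hess = parent[i].hess - built_child[i].hess;
      }
      return out;
    }

    int main() {
      const std::size_t bins = 3, num_histograms = 2;
      // Two nodes' histograms laid out back to back, one block per worker.
      std::vector<GradPair> worker0(bins * num_histograms, GradPair{1.0, 1.0});
      std::vector<GradPair> worker1(bins * num_histograms, GradPair{2.0, 2.0});
      AllReduceSum(&worker0, worker1);  // one call covers both nodes
      assert(worker0[0].grad == 3.0 && worker0[bins].grad == 3.0);

      // Parent histogram equals the sum of its two children.
      std::vector<GradPair> parent(bins, GradPair{3.0, 3.0});
      std::vector<GradPair> left(bins, GradPair{1.0, 1.0});
      auto right = Subtract(parent, left);
      assert(right[0].grad == 2.0 && right[0].hess == 2.0);
      return 0;
    }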
- this->UpdatePosition(e.nid, p_tree); + this->UpdatePosition(e, p_tree); monitor.Stop("UpdatePosition"); } - for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - } + monitor.Start("BuildHist"); + this->BuildHistLeftRight(filtered_expand_set, reducer, tree); + monitor.Stop("BuildHist"); for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); + auto candidate = filtered_expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b543a48d7cc..75d97b681a61 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 883537863307..bdabbbcb38c2 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,29 +29,38 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogram histogram; + DeviceHistogramStorage histogram; histogram.Init(0, kNBins); - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } - for (size_t i = 0; i < kNNodes; ++i) { + for (int i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Erase existing nidx_map. 
- for (size_t i = kNNodes; i < kNNodes * 2; ++i) { - histogram.AllocateHistogram(i); - } - for (size_t i = 0; i < kNNodes; ++i) { - ASSERT_FALSE(histogram.HistogramExists(i)); + // Add two new nodes + histogram.AllocateHistograms({kNNodes}); + histogram.AllocateHistograms({kNNodes+1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists({kNNodes})); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -95,9 +104,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; + maker.histogram_rounding = CreateRoundingFactor(maker.gpair); BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -105,7 +114,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogram& d_hist = maker.hist; + DeviceHistogramStorage& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -128,12 +137,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); - TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); - TestBuildHist(true); } TEST(GpuHist, ApplySplit) { @@ -173,8 +180,6 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); - - ASSERT_EQ(updater.node_categories.size(), 1); } } @@ -238,7 +243,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); From e1fb7024fdea6224349a3fbd863ba839b3a78748 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 1 May 2022 09:50:09 -0700 Subject: [PATCH 04/64] Refactor categoricals --- src/tree/gpu_hist/evaluate_splits.cu | 39 ++++++++------- src/tree/gpu_hist/evaluate_splits.cuh | 6 +-- src/tree/gpu_hist/evaluator.cu | 72 +++++++++++++-------------- src/tree/updater_gpu_hist.cu | 43 +++++----------- tests/cpp/tree/test_gpu_hist.cu | 2 - 5 files changed, 72 insertions(+), 90 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index ce8b13d0def2..7fba1902b881 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -273,12 +273,19 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, * \brief Set the bits for categorical splits based on the split threshold. 
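The partition-based categorical splits handled below rely on a sorted index over the categorical bins; the usual approach, sketched here, is to order categories by their gradient/hessian ratio and then scan prefixes of that order exactly like a numerical feature. The numbers and the exact sort key are illustrative assumptions rather than the precise library logic:

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<double> grad = {-2.0, 1.5, 0.5, -0.5};  // per-category gradient sums
      std::vector<double> hess = { 1.0, 1.0, 1.0,  1.0};  // per-category hessian sums

      std::vector<int> order(grad.size());
      std::iota(order.begin(), order.end(), 0);
      std::sort(order.begin(), order.end(),
                [&](int a, int b) { return grad[a] / hess[a] < grad[b] / hess[b]; });

      // Every prefix of `order` is a candidate "left" partition; the best prefix
      // is then chosen by the usual gain computation (omitted here).
      for (int cat : order) std::printf("%d ", cat);
      std::printf("\n");  // prints "0 3 2 1" for the numbers above
      return 0;
    }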
*/ template -__device__ void SortBasedSplit(EvaluateSplitInputs const &input, +__device__ void SetCategoricalSplit(EvaluateSplitInputs const &input, common::Span d_sorted_idx, bst_feature_t fidx, bool is_left, common::Span out, - DeviceSplitCandidate *p_out_split) { + DeviceSplitCandidate *p_out_split, ObjInfo task) { auto &out_split = *p_out_split; out_split.split_cats = common::CatBitField{out}; + + // Simple case for one hot split + if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + out_split.split_cats.Set(common::AsCat(out_split.fvalue)); + return; + } + auto node_sorted_idx = is_left ? d_sorted_idx.subspan(0, input.feature_values.size()) : d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size()); @@ -313,7 +320,7 @@ void GPUHistEvaluator::EvaluateSplits( EvaluateSplitInputs left, EvaluateSplitInputs right, ObjInfo task, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { - if (!split_cats_.empty()) { + if (need_sort_histogram_) { this->SortHistogram(left, right, evaluator); } @@ -354,14 +361,12 @@ void GPUHistEvaluator::EvaluateSplits( template void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out) { - if (has_sort_) { - dh::CUDAEvent event; - event.Record(dh::DefaultStream()); - auto h_cats = this->HostCatStorage(input.nidx); - copy_stream_.View().Wait(event); - dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(), - cudaMemcpyDeviceToHost, copy_stream_.View())); - } + dh::CUDAEvent event; + event.Record(dh::DefaultStream()); + auto h_cats = this->HostCatStorage(input.nidx); + copy_stream_.View().Wait(event); + dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(), + cudaMemcpyDeviceToHost, copy_stream_.View())); } template @@ -378,17 +383,16 @@ void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, Ob auto d_sorted_idx = this->SortedIdx(left); auto d_entries = out_entries; auto cats_out = this->DeviceCatStorage(left.nidx); - // turn candidate into entry, along with hanlding sort based split. + // turn candidate into entry, along with handling sort based split. dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) { auto const &input = i == 0 ? left : right; auto &split = out_splits[i]; auto fidx = out_splits[i].findex; - if (split.is_cat && - !common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + if (split.is_cat) { bool is_left = i == 0; auto out = is_left ? 
cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2); - SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]); + SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i], task); } float base_weight = @@ -420,9 +424,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( auto &split = out_split[i]; auto fidx = out_split[i].findex; - if (split.is_cat && - !common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { - SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]); + if (split.is_cat) { + SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i], task); } float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum}); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index b03fd7b41b51..f28aac97b417 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -58,9 +58,9 @@ class GPUHistEvaluator { dh::device_vector feature_idx_; // Training param used for evaluation TrainParam param_; - // whether the input data requires sort based split, which is more complicated so we try - // to avoid it if possible. - bool has_sort_{false}; + // Do we have any categorical features that require sorting histograms? + // use this to skip the expensive sort step + bool need_sort_histogram_ = false; // Copy the categories from device to host asynchronously. void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index bc2027489131..6c081e1ba6df 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -30,46 +30,44 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, // This condition avoids sort-based split function calls if the users want // onehot-encoding-based splits. // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. - has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { - auto idx = i - 1; - if (common::IsCat(ft, idx)) { - auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); - return use_sort; - } - return false; - }); + need_sort_histogram_ = + thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { + auto idx = i - 1; + if (common::IsCat(ft, idx)) { + auto n_bins = ptrs[i] - ptrs[idx]; + bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + return use_sort; + } + return false; + }); - if (has_sort_) { - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
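The any_of in the evaluator hunk above only decides whether the sort-based categorical path can be skipped entirely. A plain host-side equivalent, purely as an illustration (NeedSortHistogram, the vector arguments and UseOneHotSketch are made-up names for this sketch):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    bool UseOneHotSketch(uint32_t n_cats, uint32_t max_cat_to_onehot) {
      return n_cats < max_cat_to_onehot;
    }

    // cut_ptrs[f] .. cut_ptrs[f+1] delimit the histogram bins of feature f.
    bool NeedSortHistogram(const std::vector<uint32_t>& cut_ptrs,
                           const std::vector<bool>& is_categorical,
                           uint32_t max_cat_to_onehot) {
      for (std::size_t f = 0; f + 1 < cut_ptrs.size(); ++f) {
        uint32_t n_bins = cut_ptrs[f + 1] - cut_ptrs[f];
        if (is_categorical[f] && !UseOneHotSketch(n_bins, max_cat_to_onehot)) {
          return true;  // at least one feature needs sort-based evaluation
        }
      }
      return false;
    }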
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. + sort_input_.resize(cat_sorted_idx_.size()); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), - feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); - } + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), + [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 569188fd5374..861b6e15b264 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -197,8 +197,6 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; - // Storing split categories for last node. 
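The dh::SegmentId lookup cached in the evaluator hunk above maps a global bin index back to the feature that owns it, using the cut_ptrs_ offsets. A standalone host-side version of the same idea (illustrative only, not the dh:: implementation):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // cut_ptrs[f] is the first bin of feature f; cut_ptrs.back() is the total
    // number of bins. Returns the feature owning bin `bin_idx`.
    std::size_t SegmentIdSketch(const std::vector<uint32_t>& cut_ptrs,
                                uint32_t bin_idx) {
      auto it = std::upper_bound(cut_ptrs.begin(), cut_ptrs.end(), bin_idx);
      return static_cast<std::size_t>(it - cut_ptrs.begin()) - 1;
    }
    // Example: with cut_ptrs = {0, 2, 4}, bins 0-1 belong to feature 0 and
    // bins 2-3 to feature 1, so SegmentIdSketch(cut_ptrs, 3) == 1.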
- dh::caching_device_vector node_categories; GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -354,14 +352,14 @@ struct GPUHistMakerDevice { return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); } - void UpdatePosition(int nidx, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[nidx]; - auto split_type = p_tree->NodeSplitType(nidx); + void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto node_cats = dh::ToSpan(node_categories); + auto node_cats = e.split.split_cats.Bits(); row_partitioner->UpdatePosition( - nidx, split_node.LeftChild(), split_node.RightChild(), + e.nid, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -567,28 +565,13 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value too large."; std::vector split_cats; - if (candidate.split.split_cats.Bits().empty()) { - if (common::InvalidCat(candidate.split.fvalue)) { - common::InvalidCategory(); - } - auto cat = common::AsCat(candidate.split.fvalue); - split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); - common::CatBitField cats_bits(split_cats); - cats_bits.Set(cat); - dh::CopyToD(split_cats, &node_categories); - } else { - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - - node_categories.resize(candidate.split.split_cats.Bits().size()); - dh::safe_cuda(cudaMemcpyAsync( - node_categories.data().get(), candidate.split.split_cats.Data(), - candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); - } - + CHECK_GT(candidate.split.split_cats.Bits().size(), 0); + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), @@ -674,7 +657,7 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
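For context, the row-routing callback passed to UpdatePosition reduces to a per-row decision of roughly the following shape (host-side sketch; the actual left/right convention, default direction and missing-value handling in the GPU kernel are simplified away here, and all names are illustrative):

    #include <cstdint>
    #include <vector>

    enum class SplitType { kNumerical, kCategorical };

    // Which child does a row go to, given the feature value tested by the
    // split? For categorical splits the chosen categories are a bitset over
    // category ids; here "bit set" means go left, but the kernel's actual
    // convention may differ.
    bool GoLeft(SplitType type, float fvalue, float split_value,
                const std::vector<uint32_t>& category_bits) {
      if (type == SplitType::kCategorical) {
        auto cat = static_cast<uint32_t>(fvalue);
        return (category_bits[cat / 32] >> (cat % 32)) & 1u;
      }
      return fvalue < split_value;  // numerical split
    }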
- this->UpdatePosition(candidate.nid, p_tree); + this->UpdatePosition(candidate, p_tree); monitor.Stop("UpdatePosition"); monitor.Start("BuildHist"); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 3c93c283917a..ea5556b38fca 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -174,8 +174,6 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); - - ASSERT_EQ(updater.node_categories.size(), 1); } } From dc100cfbf5bb10230875680424611a8136cb7996 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 2 May 2022 06:00:24 -0700 Subject: [PATCH 05/64] Refactor categoricals 2 --- src/common/categorical.h | 4 +- src/tree/gpu_hist/evaluate_splits.cu | 4 +- src/tree/gpu_hist/evaluate_splits.cuh | 28 ++++++++----- src/tree/gpu_hist/evaluator.cu | 15 +++---- src/tree/hist/evaluate_splits.h | 2 +- src/tree/updater_gpu_hist.cu | 4 +- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 31 ++++++++------ tests/cpp/tree/test_gpu_hist.cu | 40 ------------------- 8 files changed, 52 insertions(+), 76 deletions(-) diff --git a/src/common/categorical.h b/src/common/categorical.h index 5eff62264cf2..341a887f48a9 100644 --- a/src/common/categorical.h +++ b/src/common/categorical.h @@ -82,8 +82,8 @@ inline void InvalidCategory() { /*! * \brief Whether should we use onehot encoding for categorical data. */ -XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) { - bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot(); +XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) { + bool use_one_hot = n_cats < max_cat_to_onehot; return use_one_hot; } diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 7fba1902b881..2966b84e75af 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -241,7 +241,7 @@ __global__ void EvaluateSplitsKernel( if (common::IsCat(inputs.feature_types, fidx)) { auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx]; - if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) { + if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) { EvaluateFeature(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { @@ -281,7 +281,7 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &inp out_split.split_cats = common::CatBitField{out}; // Simple case for one hot split - if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) { out_split.split_cats.Set(common::AsCat(out_split.fvalue)); return; } diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index f28aac97b417..67e56426217a 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -61,6 +61,9 @@ class GPUHistEvaluator { // Do we have any categorical features that require sorting histograms? // use this to skip the expensive sort step bool need_sort_histogram_ = false; + // Number of elements of categorical storage type + // needed to hold categoricals for a single mode + std::size_t node_categorical_storage_size_ = 0; // Copy the categories from device to host asynchronously. 
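The asynchronous device-to-host copy mentioned here overlaps with ongoing GPU work by pairing an event with a dedicated copy stream. A bare-CUDA sketch of that pattern (standalone and simplified; it is not the dh:: wrapper API used in the patch, error checking is omitted, and h_dst should be pinned for the copy to be truly asynchronous):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Copy n_bytes of device data to host memory without blocking the default
    // stream: the copy stream first waits for an event recorded after the
    // kernel that produced d_src.
    void AsyncCopyToHost(void* h_dst, const void* d_src, size_t n_bytes,
                         cudaStream_t copy_stream) {
      cudaEvent_t ready;
      cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
      cudaEventRecord(ready, /*stream=*/0);        // default stream
      cudaStreamWaitEvent(copy_stream, ready, 0);  // order the copy after it
      cudaMemcpyAsync(h_dst, d_src, n_bytes, cudaMemcpyDeviceToHost, copy_stream);
      cudaEventDestroy(ready);  // release is deferred until the event completes
    }
    // The consumer synchronizes copy_stream before touching h_dst, which is
    // what the Sync() call in GetHostNodeCats() corresponds to.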
void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); @@ -69,12 +72,17 @@ class GPUHistEvaluator { * \brief Get host category storage of nidx for internal calculation. */ auto HostCatStorage(bst_node_t nidx) { - auto cat_bits = h_split_cats_.size() / param_.MaxNodes(); + + std::size_t min_size=(nidx+2)*node_categorical_storage_size_; + if(h_split_cats_.size(){h_split_cats_}.subspan(nidx * cat_bits, cat_bits); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * cat_bits, cat_bits * 2); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2); return cats_out; } @@ -82,12 +90,15 @@ class GPUHistEvaluator { * \brief Get device category storage of nidx for internal calculation. */ auto DeviceCatStorage(bst_node_t nidx) { - auto cat_bits = split_cats_.size() / param_.MaxNodes(); + std::size_t min_size=(nidx+2)*node_categorical_storage_size_; + if(split_cats_.size() ft, ObjInfo task, + void Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, int32_t device); /** @@ -123,8 +134,7 @@ class GPUHistEvaluator { */ common::Span GetHostNodeCats(bst_node_t nidx) const { copy_stream_.View().Sync(); - auto cat_bits = h_split_cats_.size() / param_.MaxNodes(); - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * cat_bits, cat_bits); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } /** diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 6c081e1ba6df..777b017be24e 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -16,12 +16,12 @@ namespace xgboost { namespace tree { template void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, - common::Span ft, ObjInfo task, + common::Span ft, bst_feature_t n_features, TrainParam const ¶m, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical() && !task.UseOneHot()) { + if (cuts.HasCategorical()) { dh::XGBCachingDeviceAllocator alloc; auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); auto beg = thrust::make_counting_iterator(1ul); @@ -35,21 +35,22 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, auto idx = i - 1; if (common::IsCat(ft, idx)) { auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + bool use_sort = !common::UseOneHot(n_bins, to_onehot); return use_sort; } return false; }); - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); + node_categorical_storage_size_ = + common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(node_categorical_storage_size_, 0); // We need to allocate for all nodes since the updater can grow the tree layer by // layer, all nodes in the same layer must be preserved until that layer is // finished. We can allocate one layer at a time, but the best case is reducing the // size of the bitset by about a half, at the cost of invoking CUDA malloc many more // times than necessary. 
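The per-node category storage in the hunk above grows lazily instead of being sized for the whole tree up front: asking for node nidx guarantees space for nidx and the next node id (its sibling, since children are allocated consecutively) and returns a view into a flat buffer. A host-side sketch of the same arithmetic, with made-up names and a plain std::vector standing in for the host/device containers:

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Flat, lazily grown storage: one fixed-size bitset slot per node id.
    struct NodeCatStorage {
      std::size_t words_per_node;   // bitset words needed for one node
      std::vector<uint32_t> buffer;

      // Ensure node `nidx` and the following node id both have a slot, then
      // return (offset, length) of the region covering the two slots.
      std::pair<std::size_t, std::size_t> SiblingSlots(int nidx) {
        std::size_t min_size = (static_cast<std::size_t>(nidx) + 2) * words_per_node;
        if (buffer.size() < min_size) buffer.resize(min_size, 0);
        return {static_cast<std::size_t>(nidx) * words_per_node, 2 * words_per_node};
      }
    };

In the patch the same arithmetic backs both the host and the device buffer, and the root node only consumes a single slot.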
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); + split_cats_.resize(node_categorical_storage_size_); + h_split_cats_.resize(node_categorical_storage_size_); dh::safe_cuda( cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 4e445a0680e5..8a61ea809c04 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -244,7 +244,7 @@ template class HistEvaluator { } if (is_cat) { auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx]; - if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) { + if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) { EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best); EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best); } else { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 861b6e15b264..8ee6f43f78f5 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -241,7 +241,7 @@ struct GPUHistMakerDevice { param.colsample_bytree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param, + this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id); this->interaction_constraints.Reset(); @@ -571,7 +571,7 @@ struct GPUHistMakerDevice { split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - + tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 0cbfc9f2a6cf..2243cb4dda90 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -24,14 +24,16 @@ void TestEvaluateSingleSplit(bool is_categorical) { TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{1.0, 2.0, 11.0, 12.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 2, 4}; + cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); thrust::device_vector feature_set = std::vector{0, 1}; - thrust::device_vector feature_segments = - std::vector{0, 2, 4}; - thrust::device_vector feature_values = - std::vector{1.0, 2.0, 11.0, 12.0}; - thrust::device_vector feature_min_values = - std::vector{0.0, 0.0}; + // Setup gradients so that second feature gets higher gain thrust::device_vector feature_histogram = std::vector{ @@ -42,21 +44,25 @@ void TestEvaluateSingleSplit(bool is_categorical) { FeatureType::kCategorical); common::Span d_feature_types; if (is_categorical) { + auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(), + cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); d_feature_types = dh::ToSpan(feature_types); } + EvaluateSplitInputs input{1, parent_sum, param, dh::ToSpan(feature_set), d_feature_types, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), + 
cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), dh::ToSpan(feature_histogram)}; GPUHistEvaluator evaluator{ - tparam, static_cast(feature_min_values.size()), 0}; - dh::device_vector out_cats; + tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split; @@ -264,8 +270,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cuts_.cut_values_.SetDevice(0); cuts_.min_vals_.SetDevice(0); - ObjInfo task{ObjInfo::kRegression}; - evaluator.Reset(cuts_, dh::ToSpan(ft), task, info_.num_col_, param_, 0); + evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, 0); dh::device_vector d_hist(hist_[0].size()); auto node_hist = hist_[0]; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index ea5556b38fca..2f3cc9c7d950 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -137,46 +137,6 @@ TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); } -TEST(GpuHist, ApplySplit) { - RegTree tree; - GPUExpandEntry candidate; - candidate.nid = 0; - candidate.left_weight = 1.0f; - candidate.right_weight = 2.0f; - candidate.base_weight = 3.0f; - candidate.split.is_cat = true; - candidate.split.fvalue = 1.0f; // at cat 1 - - size_t n_rows = 10; - size_t n_cols = 10; - - auto m = RandomDataGenerator{n_rows, n_cols, 0}.GenerateDMatrix(true); - GenericParameter p; - p.InitAllowUnknown(Args{}); - - TrainParam tparam; - tparam.InitAllowUnknown(Args{}); - BatchParam bparam; - bparam.gpu_id = 0; - bparam.max_bin = 3; - Context ctx{CreateEmptyGenericParam(0)}; - - for (auto& ellpack : m->GetBatches(bparam)){ - auto impl = ellpack.Impl(); - HostDeviceVector feature_types(10, FeatureType::kCategorical); - feature_types.SetDevice(bparam.gpu_id); - tree::GPUHistMakerDevice updater( - &ctx, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, bparam); - updater.ApplySplit(candidate, &tree); - - ASSERT_EQ(tree.GetSplitTypes().size(), 3); - ASSERT_EQ(tree.GetSplitTypes()[0], FeatureType::kCategorical); - ASSERT_EQ(tree.GetSplitCategories().size(), 1); - uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 - ASSERT_EQ(tree.GetSplitCategories().back(), bits); - } -} - HistogramCutsWrapper GetHostCutMatrix () { HistogramCutsWrapper cmat; cmat.SetPtrs({0, 3, 6, 9, 12, 15, 18, 21, 24}); From bc744585f7832381fd525090718286b7f93b6d09 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 2 May 2022 06:21:51 -0700 Subject: [PATCH 06/64] Skip copy if no categoricals --- src/tree/gpu_hist/evaluate_splits.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 2966b84e75af..5326b103d2d7 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -361,6 +361,7 @@ void GPUHistEvaluator::EvaluateSplits( template void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out) { + if (cats_out.empty()) return; dh::CUDAEvent event; event.Record(dh::DefaultStream()); auto h_cats = this->HostCatStorage(input.nidx); From c4f8eac8996262d8447e73ca24b88569e34fc5c2 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 04:35:32 -0700 Subject: [PATCH 07/64] Review comment --- .gitignore | 5 ++++- src/tree/gpu_hist/evaluator.cu | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) diff 
--git a/.gitignore b/.gitignore index e847342b19bd..20b92c057e1a 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,7 @@ credentials.csv # Visual Studio code + extensions .vscode .metals -.bloop \ No newline at end of file +.bloop + +# hypothesis python tests +.hypothesis \ No newline at end of file diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 777b017be24e..aaf35243b2f5 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -44,11 +44,6 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, node_categorical_storage_size_ = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); CHECK_NE(node_categorical_storage_size_, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. split_cats_.resize(node_categorical_storage_size_); h_split_cats_.resize(node_categorical_storage_size_); dh::safe_cuda( From a1cddaabbf93bb0be86bfc293dea5a84e233d719 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 07:30:55 -0700 Subject: [PATCH 08/64] Revert "Categoricals broken" This reverts commit 80a3e78f9e1dcbf2a78f6572897453c61afd60b0. --- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/evaluator.cu | 73 ++++--- src/tree/gpu_hist/histogram.cu | 9 + src/tree/updater_gpu_hist.cu | 223 +++++++++------------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 + tests/cpp/tree/test_gpu_hist.cu | 45 ++--- 6 files changed, 167 insertions(+), 186 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 7d792051e5be..b03fd7b41b51 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -92,7 +92,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs. + * \brief Get sorted index storage based on the left node of inputs . */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 381ef8fbb349..bc2027489131 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -21,36 +21,55 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical()) { + if (cuts.HasCategorical() && !task.UseOneHot()) { dh::XGBCachingDeviceAllocator alloc; - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
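For reference, the bit-storage sizing used with CatBitField::ComputeStorageSize works out roughly as follows (a simplified stand-in assuming 32-bit storage words; the real helper may round differently):

    #include <cstddef>

    // Words needed to hold one bit per category value in [0, max_category].
    std::size_t ComputeStorageWordsSketch(std::size_t max_category) {
      std::size_t n_bits = max_category + 1;
      return (n_bits + 31) / 32;
    }
    // The split-category buffer is then some multiple of this word count:
    // one slot per node whose bitset has to stay alive during the build.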
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + auto beg = thrust::make_counting_iterator(1ul); + auto end = thrust::make_counting_iterator(ptrs.size()); + auto to_onehot = param.max_cat_to_onehot; + // This condition avoids sort-based split function calls if the users want + // onehot-encoding-based splits. + // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. + has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { + auto idx = i - 1; + if (common::IsCat(ft, idx)) { + auto n_bins = ptrs[i] - ptrs[idx]; + bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + return use_sort; + } + return false; + }); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + if (has_sort_) { + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), - [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. 
+ sort_input_.resize(cat_sorted_idx_.size()); + + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), + feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); + } } } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index efb08d5e44e2..791363a05cdd 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,6 +247,15 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } +template void BuildGradientHistogram( + EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, + HistRounding rounding, + bool force_global_memory); + template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2cd9d4babeb1..2340687983a8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -57,7 +57,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogramStorage + * \struct DeviceHistogram * * \summary Data storage for node histograms on device. Automatically expands. * @@ -67,18 +67,12 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogramStorage { +template +class DeviceHistogram { private: /*! \brief Map nidx to starting index of its histogram. */ std::map nidx_map_; - // Large buffer of zeroed memory, caches histograms dh::device_vector data_; - // If we run out of storage allocate one histogram at a time - // in overflow. 
Not cached, overwritten when a new histogram - // is requested - dh::device_vector overflow_; - std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -87,8 +81,6 @@ class DeviceHistogramStorage { "Number of items in gradient type should be 2."); public: - // Start with about 16mb - DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -99,53 +91,21 @@ class DeviceHistogramStorage { dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); - overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } - dh::device_vector& Data() { return data_; } - - void AllocateHistograms(const std::vector& new_nidxs) { - for (int nidx : new_nidxs) { - CHECK(!HistogramExists(nidx)); - } - // Number of items currently used in data - const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); - if (used_size >= kStopGrowingSize) { - // Use overflow - // Delete previous entries - overflow_nidx_map_.clear(); - overflow_.resize(HistogramSize() * new_nidxs.size()); - // Zero memory - auto d_data = overflow_.data().get(); - dh::LaunchN(overflow_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0; }); - // Append new histograms - for (int nidx : new_nidxs) { - overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); - } - } else { - CHECK_GE(data_.size(), used_size); - // Expand if necessary - if (data_.size() < new_used_size) { - data_.resize(std::max(data_.size() * 2, new_used_size)); - } - // Append new histograms - for (int nidx : new_nidxs) { - nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); - } - } + size_t HistogramSize() const { + return n_bins_ * kNumItemsInGradientSum; + } - CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); + dh::device_vector& Data() { + return data_; } - /* void AllocateHistogram(int nidx) { if (HistogramExists(nidx)) return; // Number of items currently used in data @@ -179,7 +139,6 @@ class DeviceHistogramStorage { CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } - */ /** * \summary Return pointer to histogram memory for a given node. 
@@ -188,16 +147,9 @@ class DeviceHistogramStorage { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - - if (nidx_map_.find(nidx) != nidx_map_.cend()) { - // Fetch from normal cache - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span(reinterpret_cast(ptr), n_bins_); - } else { - // Fetch from overflow - auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); - return common::Span(reinterpret_cast(ptr), n_bins_); - } + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span( + reinterpret_cast(ptr), n_bins_); } }; @@ -214,7 +166,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogramStorage hist{}; + DeviceHistogram hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -237,6 +189,8 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; + // Storing split categories for last node. + dh::caching_device_vector node_categories; GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -365,6 +319,7 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { + hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(device_id), @@ -372,12 +327,8 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - // Attempt to the subtraction trick - // return true if succeeded - bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { - return false; - } + void SubtractionTrick(int nidx_parent, int nidx_histogram, + int nidx_subtraction) { auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -386,18 +337,22 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); - return true; } - void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[e.nid]; - auto split_type = p_tree->NodeSplitType(e.nid); + bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + // Make sure histograms are already allocated + hist.AllocateHistogram(nidx_subtraction); + return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); + } + + void UpdatePosition(int nidx, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[nidx]; + auto split_type = p_tree->NodeSplitType(nidx); auto d_matrix = page->GetDeviceAccessor(device_id); - auto node_cats = e.split.split_cats.Bits(); - + auto node_cats = dh::ToSpan(node_categories); row_partitioner->UpdatePosition( - e.nid, split_node.LeftChild(), split_node.RightChild(), + nidx, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -528,15 +483,13 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - // num histograms is the number of contiguous histograms in memory to reduce over - void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { + void AllReduceHist(int nidx, dh::AllReducer* reducer) { monitor.Start("AllReduce"); auto d_node_hist = 
hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum(reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * - (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * - num_histograms); + reducer->AllReduceSum( + reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); monitor.Stop("AllReduce"); } @@ -544,49 +497,33 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(std::vectorconst &candidates, dh::AllReducer* reducer, const RegTree& tree) { - if(candidates.empty()) return; - // Some nodes we will manually compute histograms - // others we will do by subtraction - std::vector hist_nidx; - std::vector subtraction_nidx; - for (auto& e : candidates) { - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); - if (fewer_right) { - hist_nidx.emplace_back(tree[e.nid].RightChild()); - subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); - } else { - hist_nidx.emplace_back(tree[e.nid].LeftChild()); - subtraction_nidx.emplace_back(tree[e.nid].RightChild()); - } - } - std::vector all_new = hist_nidx; - all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); - // Allocate the histograms - // Guaranteed contiguous memory - hist.AllocateHistograms(all_new); - - for(auto nidx:hist_nidx){ - this->BuildHist(nidx); + void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, + int nidx_right, dh::AllReducer* reducer) { + auto build_hist_nidx = nidx_left; + auto subtraction_trick_nidx = nidx_right; + + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); + if (fewer_right) { + std::swap(build_hist_nidx, subtraction_trick_nidx); } - // Reduce all in one go - // This gives much better latency in a distributed setting - // when processing a large batch - this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); + this->BuildHist(build_hist_nidx); + this->AllReduceHist(build_hist_nidx, reducer); - for (int i = 0; i < subtraction_nidx.size(); i++) { - auto build_hist_nidx = hist_nidx.at(i); - auto subtraction_trick_nidx = subtraction_nidx.at(i); - auto parent_nidx = candidates.at(i).nid; + // Check whether we can use the subtraction trick to calculate the other + bool do_subtraction_trick = this->CanDoSubtractionTrick( + candidate.nid, build_hist_nidx, subtraction_trick_nidx); - if(!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)){ - // Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer, 1); - } + if (do_subtraction_trick) { + // Calculate other histogram using subtraction trick + this->SubtractionTrick(candidate.nid, build_hist_nidx, + subtraction_trick_nidx); + } else { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer); } } @@ -609,11 +546,27 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value 
too large."; std::vector split_cats; - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + if (candidate.split.split_cats.Bits().empty()) { + if (common::InvalidCat(candidate.split.fvalue)) { + common::InvalidCategory(); + } + auto cat = common::AsCat(candidate.split.fvalue); + split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); + common::CatBitField cats_bits(split_cats); + cats_bits.Set(cat); + dh::CopyToD(split_cats, &node_categories); + } else { + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + + node_categories.resize(candidate.split.split_cats.Bits().size()); + dh::safe_cuda(cudaMemcpyAsync( + node_categories.data().get(), candidate.split.split_cats.Data(), + candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); + } tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, @@ -645,9 +598,8 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); - hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer, 1); + this->AllReduceHist(kRootNIdx, reducer); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -686,7 +638,6 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); - auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -695,16 +646,22 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
- this->UpdatePosition(e, p_tree); + this->UpdatePosition(e.nid, p_tree); monitor.Stop("UpdatePosition"); } - monitor.Start("BuildHist"); - this->BuildHistLeftRight(filtered_expand_set, reducer, tree); - monitor.Stop("BuildHist"); + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { + auto candidate = expand_set.at(i); + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); + + monitor.Start("BuildHist"); + this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); + monitor.Stop("BuildHist"); + } for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = filtered_expand_set.at(i); + auto candidate = expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 75d97b681a61..3b543a48d7cc 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,6 +95,7 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { + TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index bdabbbcb38c2..883537863307 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,38 +29,29 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogramStorage histogram; + DeviceHistogram histogram; histogram.Init(0, kNBins); - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); + for (size_t i = 0; i < kNNodes; ++i) { + histogram.AllocateHistogram(i); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); + for (size_t i = 0; i < kNNodes; ++i) { + histogram.AllocateHistogram(i); } - for (int i = 0; i < kNNodes; ++i) { + for (size_t i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Add two new nodes - histogram.AllocateHistograms({kNNodes}); - histogram.AllocateHistograms({kNNodes+1}); - - // Old cached nodes should still exist - for (int i = 0; i < kNNodes; ++i) { - ASSERT_TRUE(histogram.HistogramExists(i)); + // Erase existing nidx_map. 
+ for (size_t i = kNNodes; i < kNNodes * 2; ++i) { + histogram.AllocateHistogram(i); + } + for (size_t i = 0; i < kNNodes; ++i) { + ASSERT_FALSE(histogram.HistogramExists(i)); } - - // Should be deleted - ASSERT_FALSE(histogram.HistogramExists({kNNodes})); - // Most recent node should exist - ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); - - // Add same node again - should fail - EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -104,9 +95,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistograms({0}); + maker.hist.AllocateHistogram(0); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair); + maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -114,7 +105,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogramStorage& d_hist = maker.hist; + DeviceHistogram& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -137,10 +128,12 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); + TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); + TestBuildHist(true); } TEST(GpuHist, ApplySplit) { @@ -180,6 +173,8 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); + + ASSERT_EQ(updater.node_categories.size(), 1); } } @@ -243,7 +238,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistograms({0}); + maker.hist.AllocateHistogram(0); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); From fd0e25e0bd2cf05f33766c7b1deb1471126f9447 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 08:49:40 -0700 Subject: [PATCH 09/64] Lint --- src/tree/driver.h | 4 ++-- src/tree/updater_approx.cc | 8 +------- src/tree/updater_gpu_hist.cu | 4 ++-- src/tree/updater_quantile_hist.cc | 7 +------ 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index 1e40cc32622f..e61255e043c7 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -57,7 +57,7 @@ class Driver { // Can a child of this entry still be expanded? 
// can be used to avoid extra work - bool IsChildValid(ExpandEntryT const& parent_entry){ + bool IsChildValid(ExpandEntryT const& parent_entry) { if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; return true; @@ -100,7 +100,7 @@ class Driver { private: TrainParam param_; - std::size_t num_leaves_=1; + std::size_t num_leaves_ = 1; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index fc05aed0a3ee..99e7cf738200 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -184,7 +184,6 @@ class GloablApproxBuilder { Driver driver(param_); auto &tree = *p_tree; driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)}); - bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); /** @@ -203,14 +202,9 @@ class GloablApproxBuilder { // candidates that can be applied. std::vector applied; for (auto const &candidate : expand_set) { - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } evaluator_.ApplyTreeSplit(candidate, p_tree); applied.push_back(candidate); - num_leaves++; - int left_child_nidx = tree[candidate.nid].LeftChild(); - if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) { + if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 07f1499e213f..634f2969a090 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -637,7 +637,7 @@ struct GPUHistMakerDevice { // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { - for(auto & candidate: expand_set){ + for (auto& candidate : expand_set) { this->ApplySplit(candidate, p_tree); } // Get the candidates we are allowed to expand further @@ -649,7 +649,7 @@ struct GPUHistMakerDevice { auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); - for(const auto &e:filtered_expand_set){ + for (const auto& e : filtered_expand_set) { monitor.Start("UpdatePosition"); // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index c69f8c8dba0b..ed3dff67295a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -196,7 +196,6 @@ void QuantileHistMaker::Builder::ExpandTree( Driver driver(param_); driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); auto const &tree = *p_tree; - bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); while (!expand_set.empty()) { @@ -206,13 +205,9 @@ void QuantileHistMaker::Builder::ExpandTree( std::vector applied; int32_t depth = expand_set.front().depth + 1; for (auto const& candidate : expand_set) { - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } evaluator_->ApplyTreeSplit(candidate, p_tree); applied.push_back(candidate); - num_leaves++; - if (CPUExpandEntry::ChildIsValid(param_, depth, num_leaves)) { + if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } } From 56785f3168c26a248572e6edd0f6c8b8c2885bde Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 May 2022 05:05:15 -0700 Subject: [PATCH 10/64] Revert "Revert "Categoricals broken"" This reverts commit a1cddaabbf93bb0be86bfc293dea5a84e233d719. 
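The Driver::IsChildValid helper introduced by the lint patch above centralizes the depth and leaf-count checks that the updaters previously tracked by hand. A minimal host-side sketch of the bookkeeping (ExpandEntrySketch and DriverSketch are illustrative stand-ins for CPUExpandEntry/GPUExpandEntry and Driver):

    #include <cstddef>
    #include <vector>

    struct ExpandEntrySketch {  // stand-in for CPUExpandEntry / GPUExpandEntry
      int nid;
      int depth;
    };

    // A child may only be expanded while the depth and leaf limits still allow
    // growth; 0 means "no limit", as in TrainParam.
    struct DriverSketch {
      int max_depth;
      int max_leaves;
      std::size_t num_leaves = 1;

      bool IsChildValid(const ExpandEntrySketch& parent) const {
        if (max_depth > 0 && parent.depth + 1 >= max_depth) return false;
        if (max_leaves > 0 && num_leaves >= static_cast<std::size_t>(max_leaves)) return false;
        return true;
      }
    };
    // Per-level loop shape in the updaters: apply every popped split, then keep
    // a candidate for histogram building only if IsChildValid(candidate).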
--- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/histogram.cu | 9 - src/tree/updater_gpu_hist.cu | 202 ++++++++++++---------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 - tests/cpp/tree/test_gpu_hist.cu | 43 +++-- tests/python-gpu/test_gpu_updaters.py | 2 +- 6 files changed, 140 insertions(+), 119 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 8d5cc809a280..08b0270ee4d7 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -103,7 +103,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs . + * \brief Get sorted index storage based on the left node of inputs. */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 791363a05cdd..efb08d5e44e2 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } -template void BuildGradientHistogram( - EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, - HistRounding rounding, - bool force_global_memory); - template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 634f2969a090..964a486baf16 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogram + * \struct DeviceHistogramStorage * * \summary Data storage for node histograms on device. Automatically expands. * @@ -72,12 +72,18 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogram { +template +class DeviceHistogramStorage { private: /*! \brief Map nidx to starting index of its histogram. */ std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. 
Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -86,6 +92,8 @@ class DeviceHistogram { "Number of items in gradient type should be 2."); public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -93,52 +101,48 @@ class DeviceHistogram { void Reset() { auto d_data = data_.data().get(); - dh::LaunchN(data_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); + dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); + overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { - return n_bins_ * kNumItemsInGradientSum; - } - - dh::device_vector& Data() { - return data_; - } + size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } - void AllocateHistogram(int nidx) { - if (HistogramExists(nidx)) return; + void AllocateHistograms(const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } // Number of items currently used in data const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize(); - if (data_.size() >= kStopGrowingSize) { - // Recycle histogram memory - if (new_used_size <= data_.size()) { - // no need to remove old node, just insert the new one. 
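The storage class being reintroduced here keeps one cached histogram per node until the cache hits a size cap, then parks further batches in a scratch overflow area that the next allocation overwrites, which is the behaviour the updated DeviceHistogram test earlier in this series checks. A compact host-side sketch of that policy, with plain std::vector/std::map standing in for the device containers and reset/zeroing details omitted:

    #include <algorithm>
    #include <cstddef>
    #include <map>
    #include <vector>

    // Two-tier histogram storage: nodes are cached in `data` until it reaches
    // `stop_growing`; after that, each new batch of nodes lives in a single
    // `overflow` buffer that the next batch overwrites.
    struct HistStorageSketch {
      std::size_t hist_size;     // floats per node histogram
      std::size_t stop_growing;  // cache limit, in floats
      std::vector<float> data, overflow;
      std::map<int, std::size_t> cached, overflowed;

      void Allocate(const std::vector<int>& nidxs) {
        std::size_t used = cached.size() * hist_size;
        if (used >= stop_growing) {
          overflowed.clear();  // drop the previous overflow batch
          overflow.assign(nidxs.size() * hist_size, 0.0f);
          for (int nidx : nidxs) {
            std::size_t offset = overflowed.size() * hist_size;
            overflowed[nidx] = offset;
          }
        } else {
          std::size_t needed = used + nidxs.size() * hist_size;
          if (data.size() < needed) data.resize(std::max(data.size() * 2, needed), 0.0f);
          for (int nidx : nidxs) {
            std::size_t offset = cached.size() * hist_size;
            cached[nidx] = offset;
          }
        }
      }

      float* Histogram(int nidx) {
        auto it = cached.find(nidx);
        return it != cached.end() ? data.data() + it->second
                                  : overflow.data() + overflowed.at(nidx);
      }
    };

Cached nodes therefore stay addressable for the whole build, while overflow nodes are only valid until the next allocation.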
- nidx_map_[nidx] = used_size; - // memset histogram size in bytes - } else { - std::pair old_entry = *nidx_map_.begin(); - nidx_map_.erase(old_entry.first); - nidx_map_[nidx] = old_entry.second; + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); } - // Zero recycled memory - auto d_data = data_.data().get() + nidx_map_[nidx]; - dh::LaunchN(n_bins_ * 2, - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); } else { - // Append new node histogram - nidx_map_[nidx] = used_size; - // Check there is enough memory for another histogram node - if (data_.size() < new_used_size + HistogramSize()) { - size_t new_required_memory = - std::max(data_.size() * 2, HistogramSize()); - data_.resize(new_required_memory); + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); } } @@ -152,9 +156,16 @@ class DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span( - reinterpret_cast(ptr), n_bins_); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } } }; @@ -171,7 +182,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogram hist{}; + DeviceHistogramStorage hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -322,7 +333,6 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { - hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id), @@ -330,8 +340,12 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - void SubtractionTrick(int nidx_parent, int nidx_histogram, - int nidx_subtraction) { + // Attempt to do subtraction trick + // return true if succeeded + bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { + return false; + } auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -340,12 +354,7 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); - } - - bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - // Make sure histograms are already allocated - hist.AllocateHistogram(nidx_subtraction); - return 
hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); + return true; } void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { @@ -505,13 +514,15 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - void AllReduceHist(int nidx, dh::AllReducer* reducer) { + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum( - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->AllReduceSum(reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * + num_histograms); monitor.Stop("AllReduce"); } @@ -519,33 +530,50 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, - int nidx_right, dh::AllReducer* reducer) { - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); - if (fewer_right) { - std::swap(build_hist_nidx, subtraction_trick_nidx); + void BuildHistLeftRight(std::vector const& candidates, dh::AllReducer* reducer, + const RegTree& tree) { + if (candidates.empty()) return; + // Some nodes we will manually compute histograms + // others we will do by subtraction + std::vector hist_nidx; + std::vector subtraction_nidx; + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); + if (fewer_right) { + hist_nidx.emplace_back(tree[e.nid].RightChild()); + subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); + } else { + hist_nidx.emplace_back(tree[e.nid].LeftChild()); + subtraction_nidx.emplace_back(tree[e.nid].RightChild()); + } + } + std::vector all_new = hist_nidx; + all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); + // Allocate the histograms + // Guaranteed contiguous memory + hist.AllocateHistograms(all_new); + + for (auto nidx : hist_nidx) { + this->BuildHist(nidx); } - this->BuildHist(build_hist_nidx); - this->AllReduceHist(build_hist_nidx, reducer); + // Reduce all in one go + // This gives much better latency in a distributed setting + // when processing a large batch + this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = this->CanDoSubtractionTrick( - candidate.nid, build_hist_nidx, subtraction_trick_nidx); + for (int i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = hist_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - this->SubtractionTrick(candidate.nid, build_hist_nidx, - subtraction_trick_nidx); - } else { - // 
Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer); + if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer, 1); + } } } @@ -605,8 +633,9 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); + hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer); + this->AllReduceHist(kRootNIdx, reducer, 1); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -646,6 +675,7 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -658,18 +688,12 @@ struct GPUHistMakerDevice { monitor.Stop("UpdatePosition"); } - for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - } + monitor.Start("BuildHist"); + this->BuildHistLeftRight(filtered_expand_set, reducer, tree); + monitor.Stop("BuildHist"); for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); + auto candidate = filtered_expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b543a48d7cc..75d97b681a61 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index b3c08736c996..be51d3cc5e31 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,29 +29,38 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogram histogram; + DeviceHistogramStorage histogram; histogram.Init(0, kNBins); - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } - for (size_t i = 0; i < kNNodes; ++i) { + for (int i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Erase existing nidx_map. 
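// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-bin identity behind the
// subtraction trick used by BuildHistLeftRight above. Every row of the parent
// lands in exactly one child, so once the child with the smaller Hessian sum
// (fewer rows) has been built explicitly, the sibling follows bin by bin as
// parent - built_child. The types below are simplified stand-ins for
// GradientPairPrecise and the device-side histogram spans.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <vector>

struct GradPairSketch {
  double grad;
  double hess;
};

GradPairSketch operator-(const GradPairSketch& a, const GradPairSketch& b) {
  return {a.grad - b.grad, a.hess - b.hess};
}

// Derive the sibling histogram from the parent histogram and the explicitly
// built child histogram.
std::vector<GradPairSketch> SubtractionTrickSketch(
    const std::vector<GradPairSketch>& parent, const std::vector<GradPairSketch>& built_child) {
  std::vector<GradPairSketch> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i] = parent[i] - built_child[i];
  }
  return sibling;
}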
- for (size_t i = kNNodes; i < kNNodes * 2; ++i) { - histogram.AllocateHistogram(i); - } - for (size_t i = 0; i < kNNodes; ++i) { - ASSERT_FALSE(histogram.HistogramExists(i)); + // Add two new nodes + histogram.AllocateHistograms({kNNodes}); + histogram.AllocateHistograms({kNNodes+1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists({kNNodes})); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -96,9 +105,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; + maker.histogram_rounding = CreateRoundingFactor(maker.gpair); BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -106,7 +115,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogram& d_hist = maker.hist; + DeviceHistogramStorage& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -129,12 +138,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); - TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); - TestBuildHist(true); } HistogramCutsWrapper GetHostCutMatrix () { @@ -198,7 +205,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 257085b0c8f9..8748ddcbdf91 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note, reproduce_failure +from hypothesis import given, strategies, assume, settings, note sys.path.append("tests/python") import testing as tm From 1dd1a6cc1c74a45dcb986546f0fab753d359c70b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 10 May 2022 05:07:31 -0700 Subject: [PATCH 11/64] Limit concurrent nodes --- src/tree/driver.h | 10 ++++++---- src/tree/updater_gpu_hist.cu | 5 ++++- tests/cpp/tree/gpu_hist/test_driver.cu | 18 +++++++++++++----- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index e61255e043c7..0aef93ccf9cd 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -33,10 +33,11 @@ class Driver { std::function>; public: - explicit Driver(TrainParam param) + explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256) : param_(param), - queue_(param.grow_policy == TrainParam::kDepthWise ? 
DepthWise : - LossGuide) {} + max_node_batch_size_(max_node_batch_size), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise + : LossGuide) {} template void Push(EntryIterT begin, EntryIterT end) { for (auto it = begin; it != end; ++it) { @@ -84,7 +85,7 @@ class Driver { std::vector result; ExpandEntryT e = queue_.top(); int level = e.depth; - while (e.depth == level && !queue_.empty()) { + while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) { queue_.pop(); if (e.IsValid(param_, num_leaves_)) { num_leaves_++; @@ -101,6 +102,7 @@ class Driver { private: TrainParam param_; std::size_t num_leaves_ = 1; + std::size_t max_node_batch_size_; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 964a486baf16..eb10b42fc2fa 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -90,6 +90,7 @@ class DeviceHistogramStorage { sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); + public: // Start with about 16mb @@ -206,6 +207,7 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; + GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, @@ -653,7 +655,8 @@ struct GPUHistMakerDevice { RegTree* p_tree, dh::AllReducer* reducer, HostDeviceVector* p_out_position) { auto& tree = *p_tree; - Driver driver(param); + // Process maximum 32 nodes at a time + Driver driver(param, 32); monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_); diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu index d7f8cc63869e..8e7164e37bec 100644 --- a/tests/cpp/tree/gpu_hist/test_driver.cu +++ b/tests/cpp/tree/gpu_hist/test_driver.cu @@ -8,8 +8,8 @@ namespace tree { TEST(GpuHist, DriverDepthWise) { TrainParam p; p.InitAllowUnknown(Args{}); - p.grow_policy=TrainParam::kDepthWise; - Driver driver(p); + p.grow_policy = TrainParam::kDepthWise; + Driver driver(p, 2); EXPECT_TRUE(driver.Pop().empty()); DeviceSplitCandidate split; split.loss_chg = 1.0f; @@ -20,15 +20,23 @@ TEST(GpuHist, DriverDepthWise) { EXPECT_EQ(driver.Pop().front().nid, 0); driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}}); driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}}); - driver.Push({GPUExpandEntry{3, 2, split, 2.0f, 1.0f, 1.0f}}); - // Should return entries from level 1 + driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{4, 2, split, 2.0f, 1.0f, 1.0f}}); + // Should return 2 entries from level 1 + // as we limited the driver to pop maximum 2 nodes auto res = driver.Pop(); EXPECT_EQ(res.size(), 2); for (auto &e : res) { EXPECT_EQ(e.depth, 1); } + + // Should now return 1 entry from level 1 + res = driver.Pop(); + EXPECT_EQ(res.size(), 1); + EXPECT_EQ(res.at(0).depth, 1); + res = driver.Pop(); - EXPECT_EQ(res[0].depth, 2); + EXPECT_EQ(res.at(0).depth, 2); EXPECT_TRUE(driver.Pop().empty()); } From 8751d14956d3a85ef0aaef40f223cfe485539973 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 11 May 2022 04:17:36 -0700 Subject: [PATCH 12/64] Lint --- src/tree/updater_gpu_hist.cu | 11 ++++------- tests/cpp/tree/test_gpu_hist.cu | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 
eb10b42fc2fa..88978142ee2e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -88,9 +88,7 @@ class DeviceHistogramStorage { int device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); - static_assert(kNumItemsInGradientSum == 2, - "Number of items in gradient type should be 2."); - + static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); public: // Start with about 16mb @@ -107,11 +105,10 @@ class DeviceHistogramStorage { overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); - } - int Bins() const { - return n_bins_; + return nidx_map_.find(nidx) != nidx_map_.cend() || + overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } + int Bins() const { return n_bins_; } size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } dh::device_vector& Data() { return data_; } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index be51d3cc5e31..7d06d1731c5a 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -27,7 +27,7 @@ TEST(GpuHist, DeviceHistogram) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. dh::safe_cuda(cudaSetDevice(0)); constexpr size_t kNBins = 128; - constexpr size_t kNNodes = 4; + constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; DeviceHistogramStorage histogram; histogram.Init(0, kNBins); @@ -47,7 +47,7 @@ TEST(GpuHist, DeviceHistogram) { // Add two new nodes histogram.AllocateHistograms({kNNodes}); - histogram.AllocateHistograms({kNNodes+1}); + histogram.AllocateHistograms({kNNodes + 1}); // Old cached nodes should still exist for (int i = 0; i < kNNodes; ++i) { From 49809bf2a700067fd28879a177b0a339d1395944 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 11 May 2022 08:11:02 -0700 Subject: [PATCH 13/64] Basic blockwise partitioning --- src/tree/gpu_hist/row_partitioner.cuh | 98 ++++++++++++++++--- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 43 ++++++++ 2 files changed, 127 insertions(+), 14 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9470b6447512..2bba8fd51133 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -9,9 +9,52 @@ #include "xgboost/generic_parameters.h" #include "xgboost/task.h" #include "xgboost/tree_model.h" +#include namespace xgboost { namespace tree { + + /** \brief Used to demarcate a contiguous set of row indices associated with + * some tree node. 
*/ +struct Segment { + size_t begin{0}; + size_t end{0}; + + Segment() = default; + + Segment(size_t begin, size_t end) : begin(begin), end(end) { CHECK_GE(end, begin); } + __host__ __device__ size_t Size() const { return end - begin; } +}; + +constexpr int kUpdatePositionMaxBatch = 32; +struct UpdatePositionBatchArgs { + bst_node_t nidx_batch[kUpdatePositionMaxBatch]; + bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; + bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; + Segment segments_batch[kUpdatePositionMaxBatch]; +}; + +template +__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, + OpT op, common::Span ridx, + common::Span position, + common::Span left_counts) { + auto segment = args.segments_batch[blockIdx.x]; + auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); + auto position_segment = position.subspan(segment.begin, segment.Size()); + thrust::sort_by_key(thrust::seq, ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), + position_segment.data(), [=] __device__(auto a, auto b) { return op(a) < op(b); }); + + auto left_nidx = args.left_nidx_batch[blockIdx.x]; + int64_t left_count = 0; + for (int i = segment.begin; i < segment.end; i++) { + bst_node_t new_position = op(ridx[i]); // new node id + left_count += new_position == left_nidx; + position[i] = new_position; + } + left_counts[blockIdx.x] = left_count; +} + /*! \brief Count how many rows are assigned to left node. */ __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) { #if __CUDACC_VER_MAJOR__ > 8 @@ -36,7 +79,6 @@ __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment class RowPartitioner { public: using RowIndexT = bst_uint; - struct Segment; static constexpr bst_node_t kIgnoredTreePosition = -1; private: @@ -98,6 +140,47 @@ class RowPartitioner { */ std::vector GetPositionHost(); + template + void UpdatePositionBatch(const std::vector& nidx, + const std::vector& left_nidx, + const std::vector& right_nidx, UpdatePositionOpT op) { + // Impose this limit because we are passing arguments for each node to the kernel by parameter + // this avoids memcpy but we cannot pass arbitrary number of arguments + CHECK_EQ(nidx.size(), left_nidx.size()); + CHECK_EQ(nidx.size(), right_nidx.size()); + CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); + auto left_counts = pinned_.GetSpan(nidx.size(), 0); + + + // Prepare kernel arguments + UpdatePositionBatchArgs args; + std::copy(nidx.begin(),nidx.end(),args.nidx_batch); + std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); + std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); + for(int i = 0; i < nidx.size(); i++){ + args.segments_batch[i]=ridx_segments_.at(nidx[i]); + } + + // 1 block per node + UpdatePositionBatchKernel<<>>( + args, op, ridx_.CurrentSpan(), + position_.CurrentSpan(), left_counts); + + dh::safe_cuda(cudaDeviceSynchronize()); + + // Update segments + for (int i = 0; i < nidx.size(); i++) { + auto segment=ridx_segments_.at(nidx[i]); + auto left_count = left_counts[i]; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[i], right_nidx[i]) + 1)); + ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + } + } + /** * \brief Updates the tree position for set of training instances being split * into left and right child nodes. 
Accepts a user-defined lambda specifying @@ -215,19 +298,6 @@ class RowPartitioner { void SortPositionAndCopy(const Segment& segment, bst_node_t left_nidx, bst_node_t right_nidx, int64_t* d_left_count, cudaStream_t stream); - /** \brief Used to demarcate a contiguous set of row indices associated with - * some tree node. */ - struct Segment { - size_t begin { 0 }; - size_t end { 0 }; - - Segment() = default; - - Segment(size_t begin, size_t end) : begin(begin), end(end) { - CHECK_GE(end, begin); - } - size_t Size() const { return end - begin; } - }; }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index c8aaf82dcb3e..1e5da8a33e5c 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -106,6 +106,49 @@ void TestUpdatePosition() { TEST(RowPartitioner, Basic) { TestUpdatePosition(); } +void TestUpdatePositionBatch() { + const int kNumRows = 10; + RowPartitioner rp(0, kNumRows); + auto rows = rp.GetRowsHost(0); + EXPECT_EQ(rows.size(), kNumRows); + for (auto i = 0ull; i < kNumRows; i++) { + EXPECT_EQ(rows[i], i); + } + // Send the first five training instances to the right node + // and the second 5 to the left node + rp.UpdatePosition({0}, {1}, {2}, + [=] __device__(RowPartitioner::RowIndexT ridx) { + if (ridx > 4) { + return 1; + } + else { + return 2; + } + }); + rows = rp.GetRowsHost(1); + for (auto r : rows) { + EXPECT_GT(r, 4); + } + rows = rp.GetRowsHost(2); + for (auto r : rows) { + EXPECT_LT(r, 5); + } + + // Split the left node again + rp.UpdatePositionBatch({1}, {3}, {4}, [=] __device__(RowPartitioner::RowIndexT ridx) { + if (ridx < 7) { + return 3; + } + return 4; + }); + EXPECT_EQ(rp.GetRows(3).size(), 2); + EXPECT_EQ(rp.GetRows(4).size(), 3); + // Check position is as expected + EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); +} + +TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } + void TestFinalise() { const int kNumRows = 10; From 181d7cf2ddf8de96e0144d1378ca2e1b3268c225 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 12 May 2022 03:10:36 -0700 Subject: [PATCH 14/64] Working block partition --- tests/cpp/common/test_device_helpers.cu | 129 ++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 6e8668bd2581..b2d28a0b5320 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "../../../src/common/device_helpers.cuh" @@ -264,4 +265,132 @@ void TestAtomicAdd() { TEST(AtomicAdd, Int64) { TestAtomicAdd(); } + +template +class BlockPartition { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + + // Get left count + std::size_t left_count = 0; + if (threadIdx.x == 0) { + for (int i = 0; i < (end - begin); i++) { + left_count += op(begin[i]); + } + lcomp[0] = left_count; + } + __syncthreads(); + left_count = lcomp[0]; + // + + std::size_t loffset = 0, part = left_count, roffset = part; + auto count = end - begin; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && 
roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; + +template +__global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { + auto count = BlockPartition().Partition(begin, end, op); + if (threadIdx.x == 0) { + *count_out = count; + } +} + +template +void TestBlockPartition(thrust::device_vector& x) { + thrust::device_vector count(1); + + auto op = [] __device__(int y) { return y % 2 == 0; }; + TestBlockPartitionKernel + <<<1, kBlockSize>>>(x.data().get(), x.data().get() + x.size(), count.data().get(), op); + + auto reference = thrust::count_if(x.begin(), x.end(), op); + EXPECT_EQ(count[0], reference); + + auto left_partition_count = thrust::count_if(x.begin(), x.begin() + count[0], op); + EXPECT_EQ(count[0], left_partition_count); + auto right_partition_count = thrust::count_if(x.begin() + count[0], x.end(), op); + EXPECT_EQ(0, right_partition_count); +} + +TEST(BlockPartition, BlockPartitionEmpty) { + thrust::device_vector x; + TestBlockPartition<256>(x); +} + +TEST(BlockPartition, BlockPartitionUniform) { + thrust::device_vector x(100); + TestBlockPartition<256>(x); + thrust::fill(x.begin(),x.end(),1); + TestBlockPartition<256>(x); +} + +void MakeRandom(thrust::device_vector& x, int seed) { + auto counting = thrust::make_counting_iterator(0); + thrust::transform(counting, counting + x.size(), x.begin(), [=] __device__(auto idx) { + thrust::default_random_engine gen(seed); + thrust::uniform_int_distribution dist; + gen.discard(idx); + return dist(gen); + }); +} + +TEST(BlockPartition, BlockPartitionBasic) { + thrust::device_vector x = std::vector{0,1,2}; + TestBlockPartition<256>(x); +} + +TEST(BlockPartition, BlockPartition) { + int sizes[] = {1, 37, 1092}; + int seeds[] = {0, 1, 2, 3, 4}; + for (auto seed : seeds) { + for (auto size : sizes) { + thrust::device_vector x(size); + MakeRandom(x, seed); + thrust::device_vector y = x; + TestBlockPartition<1>(y); + y = x; + TestBlockPartition<1024>(y); + y = x; + TestBlockPartition<37>(y); + } + } +} + } // namespace xgboost From 666eb9b42d93e5f1fa49cc11cda2f38d12523414 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 12 May 2022 04:39:24 -0700 Subject: [PATCH 15/64] Reduction --- tests/cpp/common/test_device_helpers.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index b2d28a0b5320..16805128b8c2 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ 
b/tests/cpp/common/test_device_helpers.cu @@ -275,21 +275,24 @@ class BlockPartition { __shared__ typename BlockScanT::TempStorage temp1, temp2; __shared__ std::size_t lcomp[kBlockSize]; __shared__ std::size_t rcomp[kBlockSize]; + __shared__ int64_t tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); // Get left count + std::size_t count = end - begin; std::size_t left_count = 0; - if (threadIdx.x == 0) { - for (int i = 0; i < (end - begin); i++) { - left_count += op(begin[i]); - } - lcomp[0] = left_count; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); } + atomicAdd(&tmp_sum, left_count); __syncthreads(); - left_count = lcomp[0]; - // + left_count = tmp_sum; std::size_t loffset = 0, part = left_count, roffset = part; - auto count = end - begin; std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { From 66173c74fa674b899e7727aebc15e6f7b87a06e7 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 04:30:24 -0700 Subject: [PATCH 16/64] Some failing tests --- src/common/device_helpers.cuh | 61 ++++++++++++++++++ src/tree/gpu_hist/row_partitioner.cuh | 31 +++++----- src/tree/updater_gpu_hist.cu | 62 ++++++++++++------- tests/cpp/common/test_device_helpers.cu | 62 +------------------ .../cpp/tree/gpu_hist/test_row_partitioner.cu | 8 +-- 5 files changed, 120 insertions(+), 104 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 334e3b4f89bf..20cb951e8805 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1639,4 +1639,65 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; + +template +class BlockPartition { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + __shared__ unsigned long long int tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); + + // Get left count + std::size_t count = end - begin; + std::size_t left_count = 0; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + + std::size_t loffset = 0, part = left_count, roffset = part; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; } // namespace dh diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 2bba8fd51133..7e5cbf90d9b1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -27,32 +27,32 @@ struct Segment { }; constexpr int kUpdatePositionMaxBatch = 32; +template struct UpdatePositionBatchArgs { bst_node_t nidx_batch[kUpdatePositionMaxBatch]; bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; Segment segments_batch[kUpdatePositionMaxBatch]; + OpDataT data_batch[kUpdatePositionMaxBatch]; }; -template -__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, +template +__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, OpT op, common::Span ridx, common::Span position, common::Span left_counts) { auto segment = args.segments_batch[blockIdx.x]; + auto data = args.data_batch[blockIdx.x]; auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); auto position_segment = position.subspan(segment.begin, segment.Size()); - thrust::sort_by_key(thrust::seq, ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), - position_segment.data(), [=] __device__(auto a, auto b) { return op(a) < op(b); }); auto left_nidx = args.left_nidx_batch[blockIdx.x]; - int64_t left_count = 0; - for (int i = segment.begin; i < segment.end; i++) { - bst_node_t new_position = op(ridx[i]); // new node id - left_count += new_position == left_nidx; - position[i] = new_position; + auto left_count = dh::BlockPartition().Partition( + ridx_segment.begin(), ridx_segment.end(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + + if (threadIdx.x == 0) { + left_counts[blockIdx.x] = left_count; } - left_counts[blockIdx.x] = left_count; } /*! \brief Count how many rows are assigned to left node. 
*/ @@ -140,10 +140,11 @@ class RowPartitioner { */ std::vector GetPositionHost(); - template + template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, - const std::vector& right_nidx, UpdatePositionOpT op) { + const std::vector& right_nidx, + const std::vector& op_data, UpdatePositionOpT op) { // Impose this limit because we are passing arguments for each node to the kernel by parameter // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); @@ -153,16 +154,18 @@ class RowPartitioner { // Prepare kernel arguments - UpdatePositionBatchArgs args; + UpdatePositionBatchArgs args; std::copy(nidx.begin(),nidx.end(),args.nidx_batch); std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); + std::copy(op_data.begin(),op_data.end(),args.data_batch); for(int i = 0; i < nidx.size(); i++){ args.segments_batch[i]=ridx_segments_.at(nidx[i]); } // 1 block per node - UpdatePositionBatchKernel<<>>( + constexpr int kBlockSize = 512; + UpdatePositionBatchKernel<<>>( args, op, ridx_.CurrentSpan(), position_.CurrentSpan(), left_counts); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 88978142ee2e..f497c84726e2 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -356,33 +356,49 @@ struct GPUHistMakerDevice { return true; } - void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[e.nid]; - auto split_type = p_tree->NodeSplitType(e.nid); - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto node_cats = e.split.split_cats.Bits(); + // Extra data for each node that is used + // in the update position function + struct NodeSplitData { + RegTree::Node split_node; + FeatureType split_type; + common::CatBitField node_cats; + }; + + void UpdatePosition(const std::vector& candidates, RegTree* p_tree) { + std::vector nidx(candidates.size()); + std::vector left_nidx(candidates.size()); + std::vector right_nidx(candidates.size()); + std::vector split_data(candidates.size()); + for (int i = 0; i < candidates.size(); i++) { + auto& e = candidates[i]; + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); + nidx.at(i) = e.nid; + left_nidx.at(i) = split_node.LeftChild(); + right_nidx.at(i) = split_node.RightChild(); + split_data.at(i) = NodeSplitData{ split_node, split_type, e.split.split_cats }; + } - row_partitioner->UpdatePosition( - e.nid, split_node.LeftChild(), split_node.RightChild(), - [=] __device__(bst_uint ridx) { + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + row_partitioner->UpdatePositionBatch( + nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, const NodeSplitData& data) { // given a row index, returns the node id it belongs to - bst_float cut_value = - d_matrix.GetFvalue(ridx, split_node.SplitIndex()); + bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value bst_node_t new_position = 0; if (isnan(cut_value)) { - new_position = split_node.DefaultChild(); + new_position = data.split_node.DefaultChild(); } else { bool go_left = true; - if (split_type == FeatureType::kCategorical) { - go_left = common::Decision(node_cats, cut_value, split_node.DefaultLeft()); + if (data.split_type == FeatureType::kCategorical) { + go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); } else { - 
go_left = cut_value <= split_node.SplitCond(); + go_left = cut_value <= data.split_node.SplitCond(); } if (go_left) { - new_position = split_node.LeftChild(); + new_position = data.split_node.LeftChild(); } else { - new_position = split_node.RightChild(); + new_position = data.split_node.RightChild(); } } return new_position; @@ -679,14 +695,12 @@ struct GPUHistMakerDevice { auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); - for (const auto& e : filtered_expand_set) { - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. - this->UpdatePosition(e, p_tree); - monitor.Stop("UpdatePosition"); - } + monitor.Start("UpdatePosition"); + // Update position is only run when child is valid, instead of right after apply + // split (as in approx tree method). Hense we have the finalise position call + // in GPU Hist. + this->UpdatePosition(filtered_expand_set, p_tree); + monitor.Stop("UpdatePosition"); monitor.Start("BuildHist"); this->BuildHistLeftRight(filtered_expand_set, reducer, tree); diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 16805128b8c2..ec9d3af45a45 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,70 +266,10 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } -template -class BlockPartition { - public: - template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; - // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? 
llen : rlen; - // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; - __syncthreads(); - // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; - // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; - } - } - return left_count; - } -}; template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = BlockPartition().Partition(begin, end, op); + auto count = dh::BlockPartition().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 1e5da8a33e5c..2314a622f9e1 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -114,10 +114,10 @@ void TestUpdatePositionBatch() { for (auto i = 0ull; i < kNumRows; i++) { EXPECT_EQ(rows[i], i); } + std::vector extra_data = {0}; // Send the first five training instances to the right node // and the second 5 to the left node - rp.UpdatePosition({0}, {1}, {2}, - [=] __device__(RowPartitioner::RowIndexT ridx) { + rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) { if (ridx > 4) { return 1; } @@ -135,7 +135,7 @@ void TestUpdatePositionBatch() { } // Split the left node again - rp.UpdatePositionBatch({1}, {3}, {4}, [=] __device__(RowPartitioner::RowIndexT ridx) { + rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) { if (ridx < 7) { return 3; } @@ -143,8 +143,6 @@ void TestUpdatePositionBatch() { }); EXPECT_EQ(rp.GetRows(3).size(), 2); EXPECT_EQ(rp.GetRows(4).size(), 3); - // Check position is as expected - EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); } TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } From ec7fea889a83f9ca479b7f4f67b7302497007dd1 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 05:41:20 -0700 Subject: [PATCH 17/64] Handle empty candidate --- src/tree/gpu_hist/row_partitioner.cuh | 3 ++- src/tree/updater_gpu_hist.cu | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 7e5cbf90d9b1..9fb8635d0b57 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -145,14 +145,15 @@ class RowPartitioner { const std::vector& left_nidx, const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { + if (nidx.empty()) return; // Impose this limit because we are passing arguments for each node to the kernel by parameter // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); + CHECK_EQ(nidx.size(), op_data.size()); CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); auto left_counts = pinned_.GetSpan(nidx.size(), 0); - // Prepare kernel arguments UpdatePositionBatchArgs args; std::copy(nidx.begin(),nidx.end(),args.nidx_batch); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index f497c84726e2..3905cd233aac 100644 --- a/src/tree/updater_gpu_hist.cu +++ 
b/src/tree/updater_gpu_hist.cu @@ -356,8 +356,8 @@ struct GPUHistMakerDevice { return true; } - // Extra data for each node that is used - // in the update position function + // Extra data for each node that is passed + // to the update position function struct NodeSplitData { RegTree::Node split_node; FeatureType split_type; @@ -365,6 +365,7 @@ struct GPUHistMakerDevice { }; void UpdatePosition(const std::vector& candidates, RegTree* p_tree) { + if (candidates.empty()) return; std::vector nidx(candidates.size()); std::vector left_nidx(candidates.size()); std::vector right_nidx(candidates.size()); From 49c5f90aaa300bb3d844801b2d7bf2167d740108 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 06:03:17 -0700 Subject: [PATCH 18/64] Cleanup --- src/tree/gpu_hist/row_partitioner.cu | 119 +----------------- src/tree/gpu_hist/row_partitioner.cuh | 116 ++--------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 98 --------------- 3 files changed, 14 insertions(+), 319 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 9e002f77b64c..8fbded53f913 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -10,73 +10,6 @@ namespace xgboost { namespace tree { -struct IndexFlagTuple { - size_t idx; - size_t flag; -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, - const IndexFlagTuple& b) const { - return {b.idx, a.flag + b.flag}; - } -}; - -struct WriteResultsFunctor { - bst_node_t left_nidx; - common::Span position_in; - common::Span position_out; - common::Span ridx_in; - common::Span ridx_out; - int64_t* d_left_count; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
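// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a host-side analogue of what a
// single CUDA block does in UpdatePositionBatchKernel for one node, with
// std::partition standing in for dh::BlockPartition. A Segment is just a
// [begin, end) window into the row-index array; after partitioning, the left
// child owns the first left_count rows of the parent window and the right
// child owns the remainder. All names and types here are simplified
// assumptions, not the library API.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cstddef>
#include <vector>

struct SegmentSketch {
  std::size_t begin;
  std::size_t end;
  std::size_t Size() const { return end - begin; }
};

// Partition the parent's rows in place so that rows going to the left child
// come first, then derive the two child segments from left_count.
template <typename GoLeftOp>
void SplitSegmentSketch(std::vector<unsigned>* ridx, SegmentSketch parent, GoLeftOp go_left,
                        SegmentSketch* left_out, SegmentSketch* right_out) {
  auto first = ridx->begin() + static_cast<std::ptrdiff_t>(parent.begin);
  auto last = ridx->begin() + static_cast<std::ptrdiff_t>(parent.end);
  auto mid = std::partition(first, last, go_left);  // not stable, like the block version
  std::size_t left_count = static_cast<std::size_t>(mid - first);
  *left_out = SegmentSketch{parent.begin, parent.begin + left_count};
  *right_out = SegmentSketch{parent.begin + left_count, parent.end};
}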
- int scatter_address; - if (position_in[x.idx] == left_nidx) { - scatter_address = x.flag - 1; // -1 because inclusive scan - } else { - // current number of rows belong to right node + total number of rows - // belong to left node - scatter_address = (x.idx - x.flag) + *d_left_count; - } - // copy the node id to output - position_out[scatter_address] = position_in[x.idx]; - ridx_out[scatter_address] = ridx_in[x.idx]; - - // Discard - return {}; - } -}; - -// Implement partitioning via single scan operation using transform output to -// write the result -void RowPartitioner::SortPosition(common::Span position, - common::Span position_out, - common::Span ridx, - common::Span ridx_out, - bst_node_t left_nidx, bst_node_t, - int64_t* d_left_count, cudaStream_t stream) { - WriteResultsFunctor write_results{left_nidx, position, position_out, - ridx, ridx_out, d_left_count}; - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = dh::MakeTransformIterator( - counting, [=] __device__(size_t idx) { - return IndexFlagTuple{idx, static_cast(position[idx] == left_nidx)}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - position.size(), stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - position.size(), stream); -} - void Reset(int device_idx, common::Span ridx, common::Span position) { CHECK_EQ(ridx.size(), position.size()); @@ -87,26 +20,11 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_a_(num_rows), position_a_(num_rows), - ridx_b_(num_rows), position_b_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); - ridx_ = dh::DoubleBuffer{&ridx_a_, &ridx_b_}; - position_ = dh::DoubleBuffer{&position_a_, &position_b_}; ridx_segments_.emplace_back(Segment(0, num_rows)); - Reset(device_idx, ridx_.CurrentSpan(), position_.CurrentSpan()); - left_counts_.resize(256); - thrust::fill(left_counts_.begin(), left_counts_.end(), 0); - streams_.resize(2); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamCreate(&stream)); - } -} -RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_)); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamDestroy(stream)); - } + Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); } common::Span RowPartitioner::GetRows( @@ -117,15 +35,15 @@ common::Span RowPartitioner::GetRows( if (segment.Size() == 0) { return {}; } - return ridx_.CurrentSpan().subspan(segment.begin, segment.Size()); + return dh::ToSpan(ridx_).subspan(segment.begin, segment.Size()); } common::Span RowPartitioner::GetRows() { - return ridx_.CurrentSpan(); + return dh::ToSpan(ridx_); } common::Span RowPartitioner::GetPosition() { - return position_.CurrentSpan(); + return dh::ToSpan(position_); } std::vector RowPartitioner::GetRowsHost( bst_node_t nidx) { @@ -142,32 +60,5 @@ std::vector RowPartitioner::GetPositionHost() { return position; } -void RowPartitioner::SortPositionAndCopy(const Segment& segment, - bst_node_t left_nidx, - bst_node_t right_nidx, - int64_t* d_left_count, - cudaStream_t stream) { - SortPosition( - // position_in - 
common::Span(position_.Current() + segment.begin, - segment.Size()), - // position_out - common::Span(position_.Other() + segment.begin, - segment.Size()), - // row index in - common::Span(ridx_.Current() + segment.begin, segment.Size()), - // row index out - common::Span(ridx_.Other() + segment.begin, segment.Size()), - left_nidx, right_nidx, d_left_count, stream); - // Copy back key/value - const auto d_position_current = position_.Current() + segment.begin; - const auto d_position_other = position_.Other() + segment.begin; - const auto d_ridx_current = ridx_.Current() + segment.begin; - const auto d_ridx_other = ridx_.Other() + segment.begin; - dh::LaunchN(segment.Size(), stream, [=] __device__(size_t idx) { - d_position_current[idx] = d_position_other[idx]; - d_ridx_current[idx] = d_ridx_other[idx]; - }); -} }; // namespace tree }; // namespace xgboost diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9fb8635d0b57..c6b9b763ff1d 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -55,25 +55,6 @@ __global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, } } -/*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) { -#if __CUDACC_VER_MAJOR__ > 8 - int mask = __activemask(); - unsigned ballot = __ballot_sync(mask, increment); - int leader = __ffs(mask) - 1; - if (threadIdx.x % 32 == leader) { - atomicAdd(reinterpret_cast(d_count), // NOLINT - static_cast(__popc(ballot))); // NOLINT - } -#else - unsigned ballot = __ballot(increment); - if (threadIdx.x % 32 == 0) { - atomicAdd(reinterpret_cast(d_count), // NOLINT - static_cast(__popc(ballot))); // NOLINT - } -#endif -} - /** \brief Class responsible for tracking subsets of rows as we add splits and * partition training rows into different leaf nodes. */ class RowPartitioner { @@ -92,26 +73,18 @@ class RowPartitioner { */ /*! \brief Range of row index for each node, pointers into ridx below. */ std::vector ridx_segments_; - dh::TemporaryArray ridx_a_; - dh::TemporaryArray ridx_b_; - dh::TemporaryArray position_a_; - dh::TemporaryArray position_b_; /*! \brief mapping for node id -> rows. * This looks like: * node id | 1 | 2 | * rows idx | 3, 5, 1 | 13, 31 | */ - dh::DoubleBuffer ridx_; + dh::TemporaryArray ridx_; /*! \brief mapping for row -> node id. */ - dh::DoubleBuffer position_; - dh::caching_device_vector - left_counts_; // Useful to keep a bunch of zeroed memory for sort position - std::vector streams_; + dh::TemporaryArray position_; dh::PinnedMemory pinned_; public: RowPartitioner(int device_idx, size_t num_rows); - ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; @@ -167,8 +140,8 @@ class RowPartitioner { // 1 block per node constexpr int kBlockSize = 512; UpdatePositionBatchKernel<<>>( - args, op, ridx_.CurrentSpan(), - position_.CurrentSpan(), left_counts); + args, op, dh::ToSpan(ridx_), + dh::ToSpan(position_), left_counts); dh::safe_cuda(cudaDeviceSynchronize()); @@ -185,60 +158,6 @@ class RowPartitioner { } } - /** - * \brief Updates the tree position for set of training instances being split - * into left and right child nodes. Accepts a user-defined lambda specifying - * which branch each training instance should go down. - * - * \tparam UpdatePositionOpT - * \param nidx The index of the node being split. - * \param left_nidx The left child index. 
- * \param right_nidx The right child index. - * \param op Device lambda. Should provide the row index as an - * argument and return the new position for this training instance. - */ - template - void UpdatePosition(bst_node_t nidx, bst_node_t left_nidx, - bst_node_t right_nidx, UpdatePositionOpT op) { - Segment segment = ridx_segments_.at(nidx); // rows belongs to node nidx - auto d_ridx = ridx_.CurrentSpan(); - auto d_position = position_.CurrentSpan(); - if (left_counts_.size() <= nidx) { - left_counts_.resize((nidx * 2) + 1); - thrust::fill(left_counts_.begin(), left_counts_.end(), 0); - } - // Now we divide the row segment into left and right node. - - int64_t* d_left_count = left_counts_.data().get() + nidx; - // Launch 1 thread for each row - dh::LaunchN<1, 128>(segment.Size(), [segment, op, left_nidx, right_nidx, d_ridx, d_left_count, - d_position] __device__(size_t idx) { - // LaunchN starts from zero, so we restore the row index by adding segment.begin - idx += segment.begin; - RowIndexT ridx = d_ridx[idx]; - bst_node_t new_position = op(ridx); // new node id - KERNEL_CHECK(new_position == left_nidx || new_position == right_nidx); - AtomicIncrement(d_left_count, new_position == left_nidx); - d_position[idx] = new_position; - }); - // Overlap device to host memory copy (left_count) with sort - int64_t &left_count = pinned_.GetSpan(1)[0]; - dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t), - cudaMemcpyDeviceToHost, streams_[0])); - - SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]); - - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx, right_nidx) + 1)); - ridx_segments_[left_nidx] = - Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx] = - Segment(segment.begin + left_count, segment.end); - } - /** * \brief Finalise the position of all training instances after tree construction is * complete. Does not update any other meta information in this data structure, so @@ -256,10 +175,10 @@ class RowPartitioner { void FinalisePosition(Context const* ctx, ObjInfo task, HostDeviceVector* p_out_position, FinalisePositionOpT op, Sampledp sampledp) { - auto d_position = position_.Current(); - const auto d_ridx = ridx_.Current(); + auto d_position = position_.data().get(); + const auto d_ridx = ridx_.data().get(); if (!task.UpdateTreeLeaf()) { - dh::LaunchN(position_.Size(), [=] __device__(size_t idx) { + dh::LaunchN(position_.size(), [=] __device__(size_t idx) { auto position = d_position[idx]; RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -272,9 +191,9 @@ class RowPartitioner { } p_out_position->SetDevice(ctx->gpu_id); - p_out_position->Resize(position_.Size()); + p_out_position->Resize(position_.size()); auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(position_.Size(), [=] __device__(size_t idx) { + dh::LaunchN(position_.size(), [=] __device__(size_t idx) { auto position = d_position[idx]; RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -285,23 +204,6 @@ class RowPartitioner { d_position[idx] = new_position; }); } - - /** - * \brief Optimised routine for sorting key value pairs into left and right - * segments. Based on a single pass of exclusive scan, uses iterators to - * redirect inputs and outputs. 
- */ - void SortPosition(common::Span position, - common::Span position_out, - common::Span ridx, - common::Span ridx_out, bst_node_t left_nidx, - bst_node_t right_nidx, int64_t* d_left_count, - cudaStream_t stream = nullptr); - - /*! \brief Sort row indices according to position. */ - void SortPositionAndCopy(const Segment& segment, bst_node_t left_nidx, - bst_node_t right_nidx, int64_t* d_left_count, - cudaStream_t stream); }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 2314a622f9e1..e4e5c9dacb60 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -19,93 +19,6 @@ namespace xgboost { namespace tree { -void TestSortPosition(const std::vector& position_in, int left_idx, - int right_idx) { - dh::safe_cuda(cudaSetDevice(0)); - std::vector left_count = { - std::count(position_in.begin(), position_in.end(), left_idx)}; - dh::caching_device_vector d_left_count = left_count; - dh::caching_device_vector position = position_in; - dh::caching_device_vector position_out(position.size()); - - dh::caching_device_vector ridx(position.size()); - thrust::sequence(ridx.begin(), ridx.end()); - dh::caching_device_vector ridx_out(ridx.size()); - RowPartitioner rp(0,10); - rp.SortPosition( - common::Span(position.data().get(), position.size()), - common::Span(position_out.data().get(), position_out.size()), - common::Span(ridx.data().get(), ridx.size()), - common::Span(ridx_out.data().get(), ridx_out.size()), left_idx, - right_idx, d_left_count.data().get(), nullptr); - thrust::host_vector position_result = position_out; - thrust::host_vector ridx_result = ridx_out; - - // Check position is sorted - EXPECT_TRUE(std::is_sorted(position_result.begin(), position_result.end())); - // Check row indices are sorted inside left and right segment - EXPECT_TRUE( - std::is_sorted(ridx_result.begin(), ridx_result.begin() + left_count[0])); - EXPECT_TRUE( - std::is_sorted(ridx_result.begin() + left_count[0], ridx_result.end())); - - // Check key value pairs are the same - for (auto i = 0ull; i < ridx_result.size(); i++) { - EXPECT_EQ(position_result[i], position_in[ridx_result[i]]); - } -} -TEST(GpuHist, SortPosition) { - TestSortPosition({1, 2, 1, 2, 1}, 1, 2); - TestSortPosition({1, 1, 1, 1}, 1, 2); - TestSortPosition({2, 2, 2, 2}, 1, 2); - TestSortPosition({1, 2, 1, 2, 3}, 1, 2); -} - -void TestUpdatePosition() { - const int kNumRows = 10; - RowPartitioner rp(0, kNumRows); - auto rows = rp.GetRowsHost(0); - EXPECT_EQ(rows.size(), kNumRows); - for (auto i = 0ull; i < kNumRows; i++) { - EXPECT_EQ(rows[i], i); - } - // Send the first five training instances to the right node - // and the second 5 to the left node - rp.UpdatePosition(0, 1, 2, - [=] __device__(RowPartitioner::RowIndexT ridx) { - if (ridx > 4) { - return 1; - } - else { - return 2; - } - }); - rows = rp.GetRowsHost(1); - for (auto r : rows) { - EXPECT_GT(r, 4); - } - rows = rp.GetRowsHost(2); - for (auto r : rows) { - EXPECT_LT(r, 5); - } - - // Split the left node again - rp.UpdatePosition(1, 3, 4, [=]__device__(RowPartitioner::RowIndexT ridx) - { - if (ridx < 7) { - return 3 - ; - } - return 4; - }); - EXPECT_EQ(rp.GetRows(3).size(), 2); - EXPECT_EQ(rp.GetRows(4).size(), 3); - // Check position is as expected - EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); -} - -TEST(RowPartitioner, Basic) { TestUpdatePosition(); } - void TestUpdatePositionBatch() { 
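  // Partitions ten rows with UpdatePositionBatch and checks that each child segment
  // ends up holding only the row indices routed to it by the device lambda.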
const int kNumRows = 10; RowPartitioner rp(0, kNumRows); @@ -203,16 +116,5 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } -void TestIncorrectRow() { - RowPartitioner rp(0, 1); - rp.UpdatePosition(0, 1, 2, [=]__device__ (RowPartitioner::RowIndexT ridx) - { - return 4; // This is not the left branch or the right branch - }); -} - -TEST(RowPartitionerDeathTest, IncorrectRow) { - ASSERT_DEATH({ TestIncorrectRow(); },".*"); -} } // namespace tree } // namespace xgboost From bd480822f9127b2420c6d24ee4462d56acbbb4b6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 06:55:46 -0700 Subject: [PATCH 19/64] experiments --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- tests/cpp/common/test_device_helpers.cu | 74 ++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index c6b9b763ff1d..e779ced8dfdf 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -48,7 +48,7 @@ __global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, auto left_nidx = args.left_nidx_batch[blockIdx.x]; auto left_count = dh::BlockPartition().Partition( - ridx_segment.begin(), ridx_segment.end(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); if (threadIdx.x == 0) { left_counts[blockIdx.x] = left_count; diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index ec9d3af45a45..7ea6cdb1a316 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,10 +266,76 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } +template +class BlockPartitionTune { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + + /* + __shared__ unsigned long long int tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); + + // Get left count + std::size_t count = end - begin; + std::size_t left_count = 0; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + */ + std::size_t count = end - begin; + std::size_t left_count = count/2; + + std::size_t loffset = 0, part = left_count, roffset = part; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + //__syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; + template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = dh::BlockPartition().Partition(begin, end, op); + auto count = BlockPartitionTune().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } @@ -336,4 +402,10 @@ TEST(BlockPartition, BlockPartition) { } } +TEST(BlockPartition, BlockPartitionBenchmark) { + thrust::device_vector x(10000000); + thrust::sequence(x. begin(),x.end()); + TestBlockPartition<1024>(x); +} + } // namespace xgboost From c3ef1f66e05e687ecded030587933c750cf1db6b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 15:13:29 -0700 Subject: [PATCH 20/64] Improvements --- tests/cpp/common/test_device_helpers.cu | 111 ++++++++++++++++++------ 1 file changed, 85 insertions(+), 26 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 7ea6cdb1a316..175e57b54062 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,18 +266,18 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } +/* template class BlockPartitionTune { public: template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; + __shared__ int lcomp[kBlockSize]; + __shared__ int rcomp[kBlockSize]; - /* - __shared__ unsigned long long int tmp_sum; + __shared__ int64_t tmp_sum; if (threadIdx.x == 0) { tmp_sum = 0; @@ -285,41 +285,98 @@ class BlockPartitionTune { __syncthreads(); // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { left_count += op(begin[idx]); } atomicAdd(&tmp_sum, left_count); __syncthreads(); left_count = tmp_sum; - */ - std::size_t count = end - begin; - std::size_t left_count = count/2; + int loffset = 0, part = left_count, roffset = part; + int llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + int lflag = loff < part ? !op(begin[loff]) : 0; + int rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + int lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + loffset += (llen == minlen) ? kBlockSize : minlen; + roffset += (rlen == minlen) ? kBlockSize : minlen; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; +*/ +struct PartitionScanPair{ + int left; + int right; +}; + +template +class BlockPartitionTune { + public: + template + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ int lcomp[kBlockSize]; + __shared__ int rcomp[kBlockSize]; + __shared__ int64_t tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + // Get left count + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + + int loffset = 0, part = left_count, roffset = part; + int llen = 0, rlen = 0, minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + int lflag = loff < part ? !op(begin[loff]) : 0; + int rflag = roff < count ? op(begin[roff]) : 0; // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; + int lidx, ridx; BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - //__syncthreads(); minlen = llen < rlen ? llen : rlen; // compaction to figure out the right locations to swap if (lflag) lcomp[lidx] = loff; if (rflag) rcomp[ridx] = roff; __syncthreads(); + // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; + loffset = llen == minlen || llen == 0 ? loffset + kBlockSize : lcomp[minlen]; + roffset = rlen == minlen || rlen == 0 ? roffset + kBlockSize : rcomp[minlen]; // swap the 'misfit's if (tid < minlen) { auto a = begin[lcomp[tid]]; @@ -332,7 +389,6 @@ class BlockPartitionTune { } }; - template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { auto count = BlockPartitionTune().Partition(begin, end, op); @@ -403,9 +459,12 @@ TEST(BlockPartition, BlockPartition) { } TEST(BlockPartition, BlockPartitionBenchmark) { - thrust::device_vector x(10000000); - thrust::sequence(x. 
begin(),x.end()); - TestBlockPartition<1024>(x); + for (int i = 0; i < 20; i++) { + thrust::device_vector x(10000000); + MakeRandom(x, i); + // thrust::sequence(x.begin(), x.end()); + TestBlockPartition<1024>(x); + } } } // namespace xgboost From ba8bbdfd1bf1227b574eea7a3d4d49089e579118 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 15:24:10 -0700 Subject: [PATCH 21/64] Fused scan --- tests/cpp/common/test_device_helpers.cu | 35 +++++++++++++++---------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 175e57b54062..cab348e12f38 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -325,18 +325,23 @@ class BlockPartitionTune { } }; */ -struct PartitionScanPair{ +struct PartitionScanPair { int left; int right; }; +__device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { + PartitionScanPair c{a.left + b.left, a.right + b.right}; + return c; +} + template class BlockPartitionTune { public: template __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp; __shared__ int lcomp[kBlockSize]; __shared__ int rcomp[kBlockSize]; __shared__ int64_t tmp_sum; @@ -357,26 +362,28 @@ class BlockPartitionTune { left_count = tmp_sum; int loffset = 0, part = left_count, roffset = part; - int llen = 0, rlen = 0, minlen = 0; + int minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa auto loff = loffset + tid, roff = roffset + tid; - int lflag = loff < part ? !op(begin[loff]) : 0; - int rflag = roff < count ? op(begin[roff]) : 0; + + PartitionScanPair flag; + flag.left = loff < part ? !op(begin[loff]) : 0; + flag.right = roff < count ? op(begin[roff]) : 0; // scan to compute the locations for each 'misfit' in the two partitions - int lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - minlen = llen < rlen ? llen : rlen; + PartitionScanPair partial_sum; + PartitionScanPair sum; + BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); + minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; + if (flag.left) lcomp[partial_sum.left] = loff; + if (flag.right) rcomp[partial_sum.right] = roff; __syncthreads(); // reset the appropriate flags for the longer of the two - loffset = llen == minlen || llen == 0 ? loffset + kBlockSize : lcomp[minlen]; - roffset = rlen == minlen || rlen == 0 ? roffset + kBlockSize : rcomp[minlen]; + loffset = sum.left== minlen || sum.left== 0 ? loffset + kBlockSize : lcomp[minlen]; + roffset = sum.right == minlen || sum.right == 0 ? 
roffset + kBlockSize : rcomp[minlen]; // swap the 'misfit's if (tid < minlen) { auto a = begin[lcomp[tid]]; From f4ef4ca798417b8630e7a018ea1596f469e37048 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 15 May 2022 06:44:52 -0700 Subject: [PATCH 22/64] Register blocking --- tests/cpp/common/test_device_helpers.cu | 44 +++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index cab348e12f38..c685d7622164 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -338,12 +338,12 @@ __device__ PartitionScanPair operator+(const PartitionScanPair& a, const Partiti template class BlockPartitionTune { public: - template + template __device__ int Partition(IterT begin, IterT end, OpT op) { typedef cub::BlockScan BlockScanT; __shared__ typename BlockScanT::TempStorage temp; - __shared__ int lcomp[kBlockSize]; - __shared__ int rcomp[kBlockSize]; + __shared__ int lcomp[kBlockSize*kItemsPerThread]; + __shared__ int rcomp[kBlockSize*kItemsPerThread]; __shared__ int64_t tmp_sum; if (threadIdx.x == 0) { @@ -362,34 +362,36 @@ class BlockPartitionTune { left_count = tmp_sum; int loffset = 0, part = left_count, roffset = part; - int minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - - PartitionScanPair flag; - flag.left = loff < part ? !op(begin[loff]) : 0; - flag.right = roff < count ? op(begin[roff]) : 0; + auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; + + PartitionScanPair flag[kItemsPerThread]; + for (int i = 0; i < kItemsPerThread; i++) { + flag[i].left = loff+i < part ? !op(begin[loff+i]) : 0; + flag[i].right = roff+i < count ? op(begin[roff+i]) : 0; + } // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum; + PartitionScanPair partial_sum[kItemsPerThread]; PartitionScanPair sum; BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - minlen = sum.left < sum.right ? sum.left : sum.right; + int minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (flag.left) lcomp[partial_sum.left] = loff; - if (flag.right) rcomp[partial_sum.right] = roff; + for (int i = 0; i < kItemsPerThread; i++) { + if (flag[i].left) lcomp[partial_sum[i].left] = loff+i; + if (flag[i].right) rcomp[partial_sum[i].right] = roff+i; + } __syncthreads(); - // reset the appropriate flags for the longer of the two - loffset = sum.left== minlen || sum.left== 0 ? loffset + kBlockSize : lcomp[minlen]; - roffset = sum.right == minlen || sum.right == 0 ? roffset + kBlockSize : rcomp[minlen]; + loffset = sum.left == minlen ? loffset + kBlockSize*kItemsPerThread : lcomp[minlen]; + roffset = sum.right == minlen ? 
roffset + kBlockSize*kItemsPerThread : rcomp[minlen]; // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; + for(int i = tid; i < minlen; i += kBlockSize){ + auto a = begin[lcomp[i]]; + auto b = begin[rcomp[i]]; + begin[lcomp[i]] = b; + begin[rcomp[i]] = a; } } return left_count; From 9c27dd09df727f02c51df892e9aa8fa19f421f4a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 17 May 2022 02:17:13 -0700 Subject: [PATCH 23/64] Cleanup --- src/common/device_helpers.cuh | 77 ++++++++------ src/tree/gpu_hist/row_partitioner.cuh | 19 ++-- tests/cpp/common/test_device_helpers.cu | 135 +----------------------- 3 files changed, 58 insertions(+), 173 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 20cb951e8805..3fdb994fce3d 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1640,15 +1640,26 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +struct PartitionScanPair { + int left; + int right; +}; + +inline __device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { + PartitionScanPair c{a.left + b.left, a.right + b.right}; + return c; +} + template -class BlockPartition { +class BlockPartition{ public: - template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; + template + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp; + + __shared__ int16_t lcomp[kBlockSize*kItemsPerThread]; + __shared__ int16_t rcomp[kBlockSize*kItemsPerThread]; __shared__ unsigned long long int tmp_sum; if (threadIdx.x == 0) { @@ -1657,45 +1668,47 @@ class BlockPartition { __syncthreads(); // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { left_count += op(begin[idx]); } atomicAdd(&tmp_sum, left_count); __syncthreads(); left_count = tmp_sum; - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + int loffset = 0, part = left_count, roffset = part; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; + + PartitionScanPair flag[kItemsPerThread]; + for (int i = 0; i < kItemsPerThread; i++) { + flag[i].left = loff + i < part ? !op(begin[loff + i]) : 0; + flag[i].right = roff + i < count ? op(begin[roff + i]) : 0; + } // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? 
llen : rlen; + PartitionScanPair partial_sum[kItemsPerThread]; + PartitionScanPair sum; + BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); + int minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; + for (int i = 0; i < kItemsPerThread; i++) { + if (flag[i].left) lcomp[partial_sum[i].left] = tid * kItemsPerThread+i; + if (flag[i].right) rcomp[partial_sum[i].right] = tid * kItemsPerThread+i; + } __syncthreads(); - // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; + for (int i = tid; i < minlen; i += kBlockSize) { + auto a = begin[lcomp[i] + loffset]; + auto b = begin[rcomp[i] + roffset]; + begin[lcomp[i] + loffset] = b; + begin[rcomp[i] + roffset] = a; } + loffset = sum.left == minlen ? loffset + kBlockSize * kItemsPerThread : loffset + lcomp[minlen]; + roffset = sum.right == minlen ? roffset + kBlockSize * kItemsPerThread : roffset + rcomp[minlen]; } return left_count; } diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e779ced8dfdf..d1a374ec8468 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -37,18 +37,23 @@ struct UpdatePositionBatchArgs { }; template -__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, +__global__ void +__launch_bounds__(1024, 1) +UpdatePositionBatchKernel(UpdatePositionBatchArgs args, OpT op, common::Span ridx, common::Span position, common::Span left_counts) { - auto segment = args.segments_batch[blockIdx.x]; - auto data = args.data_batch[blockIdx.x]; - auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); - auto position_segment = position.subspan(segment.begin, segment.Size()); + + + const auto& segment = args.segments_batch[blockIdx.x]; + const auto& data = args.data_batch[blockIdx.x]; + const auto& ridx_segment = ridx.subspan(segment.begin, segment.Size()); + const auto& position_segment = position.subspan(segment.begin, segment.Size()); auto left_nidx = args.left_nidx_batch[blockIdx.x]; auto left_count = dh::BlockPartition().Partition( - ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + ridx_segment.data(), ridx_segment.data() + ridx_segment.size(), + [&] __device__(auto e) { return op(e, data) == left_nidx; }); if (threadIdx.x == 0) { left_counts[blockIdx.x] = left_count; @@ -138,7 +143,7 @@ class RowPartitioner { } // 1 block per node - constexpr int kBlockSize = 512; + constexpr int kBlockSize = 1024; UpdatePositionBatchKernel<<>>( args, op, dh::ToSpan(ridx_), dh::ToSpan(position_), left_counts); diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index c685d7622164..18fdb5b7eb34 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,141 +266,9 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } -/* -template -class BlockPartitionTune { - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ int 
lcomp[kBlockSize]; - __shared__ int rcomp[kBlockSize]; - - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - int loffset = 0, part = left_count, roffset = part; - int llen = 0, rlen = 0, minlen = 0; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - int lflag = loff < part ? !op(begin[loff]) : 0; - int rflag = roff < count ? op(begin[roff]) : 0; - // scan to compute the locations for each 'misfit' in the two partitions - int lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? llen : rlen; - // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; - __syncthreads(); - loffset += (llen == minlen) ? kBlockSize : minlen; - roffset += (rlen == minlen) ? kBlockSize : minlen; - // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; - } - } - return left_count; - } -}; -*/ -struct PartitionScanPair { - int left; - int right; -}; - -__device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { - PartitionScanPair c{a.left + b.left, a.right + b.right}; - return c; -} - -template -class BlockPartitionTune { - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp; - __shared__ int lcomp[kBlockSize*kItemsPerThread]; - __shared__ int rcomp[kBlockSize*kItemsPerThread]; - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - int loffset = 0, part = left_count, roffset = part; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; - - PartitionScanPair flag[kItemsPerThread]; - for (int i = 0; i < kItemsPerThread; i++) { - flag[i].left = loff+i < part ? !op(begin[loff+i]) : 0; - flag[i].right = roff+i < count ? op(begin[roff+i]) : 0; - } - // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum[kItemsPerThread]; - PartitionScanPair sum; - BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - int minlen = sum.left < sum.right ? sum.left : sum.right; - // compaction to figure out the right locations to swap - for (int i = 0; i < kItemsPerThread; i++) { - if (flag[i].left) lcomp[partial_sum[i].left] = loff+i; - if (flag[i].right) rcomp[partial_sum[i].right] = roff+i; - } - __syncthreads(); - - loffset = sum.left == minlen ? loffset + kBlockSize*kItemsPerThread : lcomp[minlen]; - roffset = sum.right == minlen ? 
roffset + kBlockSize*kItemsPerThread : rcomp[minlen]; - // swap the 'misfit's - for(int i = tid; i < minlen; i += kBlockSize){ - auto a = begin[lcomp[i]]; - auto b = begin[rcomp[i]]; - begin[lcomp[i]] = b; - begin[rcomp[i]] = a; - } - } - return left_count; - } -}; - template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = BlockPartitionTune().Partition(begin, end, op); + auto count = dh::BlockPartition().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } @@ -471,7 +339,6 @@ TEST(BlockPartition, BlockPartitionBenchmark) { for (int i = 0; i < 20; i++) { thrust::device_vector x(10000000); MakeRandom(x, i); - // thrust::sequence(x.begin(), x.end()); TestBlockPartition<1024>(x); } } From 0bcc84ac7d2420b1df74293cfe120999b0f9a9d8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 18 May 2022 04:38:34 -0700 Subject: [PATCH 24/64] Working tests --- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index e4e5c9dacb60..70412832802a 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -116,5 +116,160 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } + +const int kMaxBatch = 32; +template +struct KernelArgs { + Segment segments[kMaxBatch]; + OpDataT data[kMaxBatch]; + + // Given a global thread idx, assign it to an item from one of the segments + __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + std::size_t sum = 0; + for (int i = 0; i < kMaxBatch; i++) { + if (sum + segments[i].Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + segments[i].begin; + break; + } + sum += segments[i].Size(); + } + } + std::size_t TotalRows() const { + std::size_t total_rows = 0; + for (auto segment : segments) { + total_rows += segment.Size(); + } + return total_rows; + } +}; + +template +void GetLeftCounts(const KernelArgs&args,common::Span ridx, + common::Span d_left_counts, OpT op + ) { + + // Launch 1 thread for each row + dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + }); +} + +struct IndexFlagTuple { + size_t idx; + bool flag; + size_t flag_scan; + int batch_idx; +}; + +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + if (a.batch_idx == b.batch_idx) { + return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + } else { + return b; + } + } +}; + +template +struct WriteResultsFunctor { + KernelArgs args; + OpT op; + common::Span ridx_in; + common::Span ridx_out; + common::Span left_counts; + + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + // the ex_scan_result represents how many rows have been assigned to left + // node so far during scan. 
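+    // Rows flagged for the left child are scattered to the front of their segment
+    // (flag_scan is an inclusive count, hence the -1 below); rows going right are
+    // placed after the segment's left count computed earlier by GetLeftCounts.
+    // Example (hypothetical values): segment.begin = 4, five rows with left flags
+    // {1,0,1,1,0} and left_counts[batch] = 3 scatter to addresses {4,7,5,6,8}.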
+ std::size_t scatter_address; + if (x.flag) { + scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + } else { + // current number of rows belong to right node + total number of rows + // belong to left node + scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + // Discard + return {}; + } +}; + +template +void SortPositionBatch(const KernelArgs& args, common::Span ridx, + common::Span ridx_tmp, + common::Span left_counts, OpT op, + cudaStream_t stream) { + WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto go_left = op(ridx[item_idx], args.data[batch_idx]); + return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), + args.TotalRows(), stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + + // copy active segments back to original buffer + dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + ridx[item_idx] = ridx_tmp[item_idx]; + }); +} + +void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { + thrust::device_vector ridx = ridx_in; + thrust::device_vector ridx_tmp(ridx_in.size()); + thrust::device_vector left_counts(segments.size()); + + auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; + std::vector op_data(segments.size()); + KernelArgs args; + std::copy(segments.begin(), segments.end(), args.segments); + std::copy(op_data.begin(), op_data.end(), args.data); + GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(left_counts), op, + nullptr); + + auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; + for (int i = 0; i < segments.size(); i++) { + auto begin = ridx.begin() + segments[i].begin; + auto end = ridx.begin() + segments[i].end; + auto left_partition_count = + thrust::count_if(thrust::device, begin, begin + left_counts[i], op_without_data); + EXPECT_EQ(left_partition_count, left_counts[i]); + auto right_partition_count = + thrust::count_if(thrust::device, begin + left_counts[i], end, op_without_data); + EXPECT_EQ(right_partition_count, 0); + } +} + +TEST(GpuHist, SortPositionBatch) { + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); +} + } // namespace tree } // namespace xgboost From 723ff475fda9e3762f5f9af9424e53c0f952dffc Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 18 May 2022 05:08:38 -0700 Subject: [PATCH 25/64] Transplanted new code --- src/tree/gpu_hist/row_partitioner.cu | 2 +- 
src/tree/gpu_hist/row_partitioner.cuh | 168 ++++++++++++++---- src/tree/updater_gpu_hist.cu | 12 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 132 +------------- 4 files changed, 135 insertions(+), 179 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 8fbded53f913..91933c6d2896 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,7 +20,7 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d1a374ec8468..6de8ddff3993 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,38 +26,125 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -constexpr int kUpdatePositionMaxBatch = 32; + +const int kMaxBatch = 32; template -struct UpdatePositionBatchArgs { - bst_node_t nidx_batch[kUpdatePositionMaxBatch]; - bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; - bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; - Segment segments_batch[kUpdatePositionMaxBatch]; - OpDataT data_batch[kUpdatePositionMaxBatch]; +struct KernelArgs { + Segment segments[kMaxBatch]; + OpDataT data[kMaxBatch]; + + // Given a global thread idx, assign it to an item from one of the segments + __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + std::size_t sum = 0; + for (int i = 0; i < kMaxBatch; i++) { + if (sum + segments[i].Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + segments[i].begin; + break; + } + sum += segments[i].Size(); + } + } + std::size_t TotalRows() const { + std::size_t total_rows = 0; + for (auto segment : segments) { + total_rows += segment.Size(); + } + return total_rows; + } }; -template -__global__ void -__launch_bounds__(1024, 1) -UpdatePositionBatchKernel(UpdatePositionBatchArgs args, - OpT op, common::Span ridx, - common::Span position, - common::Span left_counts) { +template +void GetLeftCounts(const KernelArgs&args,common::Span ridx, + common::Span d_left_counts, OpT op + ) { + // Launch 1 thread for each row + dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + }); +} + +struct IndexFlagTuple { + size_t idx; + bool flag; + size_t flag_scan; + int batch_idx; +}; - const auto& segment = args.segments_batch[blockIdx.x]; - const auto& data = args.data_batch[blockIdx.x]; - const auto& ridx_segment = ridx.subspan(segment.begin, segment.Size()); - const auto& position_segment = position.subspan(segment.begin, segment.Size()); +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + if (a.batch_idx == b.batch_idx) { + return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + } else { + return b; + } + } +}; - auto left_nidx = args.left_nidx_batch[blockIdx.x]; - auto left_count = dh::BlockPartition().Partition( - 
ridx_segment.data(), ridx_segment.data() + ridx_segment.size(), - [&] __device__(auto e) { return op(e, data) == left_nidx; }); +template +struct WriteResultsFunctor { + KernelArgs args; + OpT op; + common::Span ridx_in; + common::Span ridx_out; + common::Span left_counts; - if (threadIdx.x == 0) { - left_counts[blockIdx.x] = left_count; + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + // the ex_scan_result represents how many rows have been assigned to left + // node so far during scan. + std::size_t scatter_address; + if (x.flag) { + scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + } else { + // current number of rows belong to right node + total number of rows + // belong to left node + scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + // Discard + return {}; } +}; + +template +void SortPositionBatch(const KernelArgs& args, common::Span ridx, + common::Span ridx_tmp, + common::Span left_counts, OpT op, + cudaStream_t stream) { + WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto go_left = op(ridx[item_idx], args.data[batch_idx]); + return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), + args.TotalRows(), stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + + // copy active segments back to original buffer + dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + ridx[item_idx] = ridx_tmp[item_idx]; + }); } /** \brief Class responsible for tracking subsets of rows as we add splits and @@ -84,6 +171,8 @@ class RowPartitioner { * rows idx | 3, 5, 1 | 13, 31 | */ dh::TemporaryArray ridx_; + // Staging area for sorting ridx + dh::TemporaryArray ridx_tmp_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; @@ -129,31 +218,32 @@ class RowPartitioner { CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); - auto left_counts = pinned_.GetSpan(nidx.size(), 0); + CHECK_LE(nidx.size(), kMaxBatch); + auto h_left_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_left_counts(nidx.size(), 0); // Prepare kernel arguments - UpdatePositionBatchArgs args; - std::copy(nidx.begin(),nidx.end(),args.nidx_batch); - std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); - std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); - std::copy(op_data.begin(),op_data.end(),args.data_batch); - for(int i = 0; i < nidx.size(); i++){ - args.segments_batch[i]=ridx_segments_.at(nidx[i]); + KernelArgs args; + std::copy(op_data.begin(), op_data.end(), args.data); + for (int i = 0; i < nidx.size(); i++) { + args.segments[i] = ridx_segments_.at(nidx[i]); } + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(d_left_counts), op); + + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, nullptr)); - // 1 block per node - constexpr int kBlockSize = 1024; - UpdatePositionBatchKernel<<>>( - args, op, dh::ToSpan(ridx_), - dh::ToSpan(position_), left_counts); + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_left_counts), op, + nullptr); dh::safe_cuda(cudaDeviceSynchronize()); // Update segments for (int i = 0; i < nidx.size(); i++) { auto segment=ridx_segments_.at(nidx[i]); - auto left_count = left_counts[i]; + auto left_count = h_left_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3905cd233aac..9b1982195fab 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -386,23 +386,17 @@ struct GPUHistMakerDevice { // given a row index, returns the node id it belongs to bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value - bst_node_t new_position = 0; + bool go_left = true; if (isnan(cut_value)) { - new_position = data.split_node.DefaultChild(); + go_left = data.split_node.DefaultLeft(); } else { - bool go_left = true; if (data.split_type == FeatureType::kCategorical) { go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); } else { go_left = cut_value <= data.split_node.SplitCond(); } - if (go_left) { - new_position = data.split_node.LeftChild(); - } else { - new_position = data.split_node.RightChild(); - } } - return new_position; + return go_left; }); } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 70412832802a..e16c1b719426 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -31,12 +31,7 @@ void TestUpdatePositionBatch() { // Send the first five training instances to the right node // and the second 5 to the left node rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) { - if (ridx > 4) { - return 1; - } - else { - return 2; - } + return ridx > 4; }); rows = rp.GetRowsHost(1); for (auto r : rows) { @@ -49,10 +44,7 
@@ void TestUpdatePositionBatch() { // Split the left node again rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) { - if (ridx < 7) { - return 3; - } - return 4; + return ridx < 7; }); EXPECT_EQ(rp.GetRows(3).size(), 2); EXPECT_EQ(rp.GetRows(4).size(), 3); @@ -117,126 +109,6 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } -const int kMaxBatch = 32; -template -struct KernelArgs { - Segment segments[kMaxBatch]; - OpDataT data[kMaxBatch]; - - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { - std::size_t sum = 0; - for (int i = 0; i < kMaxBatch; i++) { - if (sum + segments[i].Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + segments[i].begin; - break; - } - sum += segments[i].Size(); - } - } - std::size_t TotalRows() const { - std::size_t total_rows = 0; - for (auto segment : segments) { - total_rows += segment.Size(); - } - return total_rows; - } -}; - -template -void GetLeftCounts(const KernelArgs&args,common::Span ridx, - common::Span d_left_counts, OpT op - ) { - - // Launch 1 thread for each row - dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); - atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); - }); -} - -struct IndexFlagTuple { - size_t idx; - bool flag; - size_t flag_scan; - int batch_idx; -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - if (a.batch_idx == b.batch_idx) { - return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; - } else { - return b; - } - } -}; - -template -struct WriteResultsFunctor { - KernelArgs args; - OpT op; - common::Span ridx_in; - common::Span ridx_out; - common::Span left_counts; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
- std::size_t scatter_address; - if (x.flag) { - scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan - } else { - // current number of rows belong to right node + total number of rows - // belong to left node - scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; - } - ridx_out[scatter_address] = ridx_in[x.idx]; - // Discard - return {}; - } -}; - -template -void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, - common::Span left_counts, OpT op, - cudaStream_t stream) { - WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto go_left = op(ridx[item_idx], args.data[batch_idx]); - return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - args.TotalRows(), stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); - - // copy active segments back to original buffer - dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - ridx[item_idx] = ridx_tmp[item_idx]; - }); -} - void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); From 199bed96318ae7e651432aeb94b64037fac8b74d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 03:45:38 -0700 Subject: [PATCH 26/64] Optimised --- src/tree/gpu_hist/row_partitioner.cu | 13 ++- src/tree/gpu_hist/row_partitioner.cuh | 97 +++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 5 +- 3 files changed, 70 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 91933c6d2896..e13f6a8eb188 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,11 +20,22 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_tmp_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); + streams_.resize(2); + for (auto& stream : streams_) { + dh::safe_cuda(cudaStreamCreate(&stream)); + } +} + +RowPartitioner::~RowPartitioner() { + dh::safe_cuda(cudaSetDevice(device_idx_)); + for (auto& stream : streams_) { + dh::safe_cuda(cudaStreamDestroy(stream)); + } } common::Span RowPartitioner::GetRows( diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 6de8ddff3993..9d00e3528307 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -34,9 +34,9 
@@ struct KernelArgs { OpDataT data[kMaxBatch]; // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) const { std::size_t sum = 0; - for (int i = 0; i < kMaxBatch; i++) { + for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { batch_idx = i; item_idx = (idx - sum) + segments[i].begin; @@ -54,53 +54,73 @@ struct KernelArgs { } }; +// Should be 16 bytes aligned +struct IndexFlagTuple { + bst_uint idx; + bst_uint flag_scan; + bst_uint segment_start; + int16_t batch_idx; + bool flag; +}; + +/*! \brief Count how many rows are assigned to left node. */ +__forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, + int batch_idx) { + int mask = __activemask(); + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, 0)); + // If all threads here are working on the same node + // we can do a more efficient reduction with warp intrinsics + if (group_is_contiguous) { + unsigned ballot = __ballot_sync(mask, increment); + int leader = __ffs(mask) - 1; + if (threadIdx.x % 32 == leader) { + atomicAdd(d_counts + batch_idx, // NOLINT + __popc(ballot)); // NOLINT + } + } else { + atomicAdd(d_counts + batch_idx, increment); + } +} + template -void GetLeftCounts(const KernelArgs&args,common::Span ridx, +void GetLeftCounts(const KernelArgs&args,common::Span ridx,common::Span scan_tmp, common::Span d_left_counts, OpT op ) { // Launch 1 thread for each row dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { // Assign this thread to a row - int batch_idx; + int16_t batch_idx; std::size_t item_idx; args.AssignBatch(idx, batch_idx, item_idx); auto op_res = op(ridx[item_idx], args.data[batch_idx]); - atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + scan_tmp[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(),op(ridx[item_idx], args.data[batch_idx]), batch_idx); }); } -struct IndexFlagTuple { - size_t idx; - bool flag; - size_t flag_scan; - int batch_idx; -}; - struct IndexFlagOp { __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { if (a.batch_idx == b.batch_idx) { - return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; } else { return b; } } }; -template struct WriteResultsFunctor { - KernelArgs args; - OpT op; - common::Span ridx_in; - common::Span ridx_out; - common::Span left_counts; + bst_uint* ridx_in; + bst_uint* ridx_out; + unsigned long long int* left_counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { // the ex_scan_result represents how many rows have been assigned to left // node so far during scan. 
std::size_t scatter_address; if (x.flag) { - scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + scatter_address = x.segment_start + x.flag_scan - 1; // -1 because inclusive scan } else { // current number of rows belong to right node + total number of rows // belong to left node @@ -114,35 +134,25 @@ struct WriteResultsFunctor { template void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, + common::Span ridx_tmp, common::Span scan_tmp, common::Span left_counts, OpT op, cudaStream_t stream) { - WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto go_left = op(ridx[item_idx], args.data[batch_idx]); - return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; - }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_tmp.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_tmp.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); + auto item_idx = scan_tmp[idx].idx; ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -173,12 +183,15 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; + dh::TemporaryArray scan_tmp_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; + std::vector streams_; public: RowPartitioner(int device_idx, size_t num_rows); + ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; @@ -228,21 +241,21 @@ class RowPartitioner { for (int i = 0; i < nidx.size(); i++) { args.segments[i] = ridx_segments_.at(nidx[i]); } - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(d_left_counts), op); + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_tmp_), dh::ToSpan(d_left_counts), op); dh::safe_cuda( cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, nullptr)); + cudaMemcpyDefault, streams_[0])); - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_left_counts), op, - nullptr); + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_tmp_), + dh::ToSpan(d_left_counts), op, streams_[1]); - dh::safe_cuda(cudaDeviceSynchronize()); + dh::safe_cuda(cudaStreamSynchronize(streams_[0])); // Update segments for (int i = 0; i < nidx.size(); i++) { - auto segment=ridx_segments_.at(nidx[i]); + auto segment = ridx_segments_.at(nidx[i]); auto left_count = h_left_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index e16c1b719426..5ad2dbc3fc3a 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -113,14 +113,15 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); thrust::device_vector left_counts(segments.size()); + thrust::device_vector scan_tmp(ridx_in.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); KernelArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(left_counts), op, + GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; From 0e35e9949ccf2ecb556be7092679d537bd8c14cf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 06:40:05 -0700 Subject: [PATCH 27/64] Do not initialise data structures to maximum possible tree size. 
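
Previously lower_bounds_, upper_bounds_ and node_sum_gradients were sized to
param.MaxNodes() up front, on the order of 2^max_depth entries even when only
a few nodes are actually created. They now start at 256 entries and are grown
geometrically (Resize(max_nidx * 2 + 1)) as splits introduce new node ids, so
repeated growth stays amortised linear in the number of nodes produced.

A minimal standalone sketch of the pattern in plain C++ (EnsureSize is
illustrative only, not an XGBoost API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Grow `buf` just enough that index `nidx` is valid, at least doubling
    // so repeated growth costs amortised O(1) per element over the build.
    template <typename T>
    void EnsureSize(std::vector<T>* buf, std::size_t nidx, T fill) {
      if (buf->size() <= nidx) {
        buf->resize(nidx * 2 + 1, fill);
      }
    }

    int main() {
      std::vector<double> node_sum(256, 0.0);  // start small, as below
      std::size_t left = 511, right = 512;     // children created by a split
      EnsureSize(&node_sum, std::max(left, right), 0.0);
      // node_sum.size() is now 1025, large enough for both child indices.
      return 0;
    }

The same grow-on-demand resize is applied below to TreeEvaluator's bound
vectors and to GPUHistMakerDevice::node_sum_gradients.
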
--- src/tree/split_evaluator.h | 15 +++++++++++++-- src/tree/updater_gpu_hist.cu | 17 +++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index 8cdf88834559..ee24f11ccaa0 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xgboost/tree_model.h" #include "xgboost/host_device_vector.h" @@ -49,8 +50,9 @@ class TreeEvaluator { } else { monotone_.HostVector() = p.monotone_constraints; monotone_.HostVector().resize(n_features, 0); - lower_bounds_.Resize(p.MaxNodes(), -std::numeric_limits::max()); - upper_bounds_.Resize(p.MaxNodes(), std::numeric_limits::max()); + // Initialised to some small size, can grow if needed + lower_bounds_.Resize(256, -std::numeric_limits::max()); + upper_bounds_.Resize(256, std::numeric_limits::max()); has_constraint_ = true; } @@ -157,6 +159,15 @@ class TreeEvaluator { if (!has_constraint_) { return; } + + auto max_nidx = std::max(leftid, rightid); + if (lower_bounds_.Size() <= max_nidx) { + lower_bounds_.Resize(max_nidx * 2 + 1); + } + if (upper_bounds_.Size() <= max_nidx) { + upper_bounds_.Resize(max_nidx * 2 + 1); + } + common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t, common::Span lower, common::Span upper, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9b1982195fab..e72de08b203b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -223,7 +223,7 @@ struct GPUHistMakerDevice { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; } - node_sum_gradients.resize(param.MaxNodes()); + node_sum_gradients.resize(256); // Init histogram hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); @@ -625,12 +625,17 @@ struct GPUHistMakerDevice { } evaluator_.ApplyTreeSplit(candidate, p_tree); - node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum; - node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum; + const auto& parent = tree[candidate.nid]; + std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild()); + // Grow as needed + if (node_sum_gradients.size() <= max_nidx) { + node_sum_gradients.resize(max_nidx * 2 + 1); + } + node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum; + node_sum_gradients[parent.RightChild()] = candidate.split.right_sum; - interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(), - tree[candidate.nid].LeftChild(), - tree[candidate.nid].RightChild()); + interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(), + parent.RightChild()); } GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) { From daa9b56fa41e9517ddb8edcb9f2d8afd2feb5394 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 13:15:38 -0700 Subject: [PATCH 28/64] Comments, cleanup --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 157 ++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 4 +- 3 files changed, 94 insertions(+), 69 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index e13f6a8eb188..86642ab7170e 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,7 +20,7 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : 
device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_tmp_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_inputs_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9d00e3528307..1cf753a59894 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -27,9 +27,9 @@ struct Segment { }; -const int kMaxBatch = 32; template -struct KernelArgs { +struct KernelBatchArgs { + static const int kMaxBatch = 8; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; @@ -54,15 +54,29 @@ struct KernelArgs { } }; -// Should be 16 bytes aligned +// We can scan over this tuple, where the scan gives us information on how to partition inputs +// according to the flag struct IndexFlagTuple { - bst_uint idx; - bst_uint flag_scan; - bst_uint segment_start; - int16_t batch_idx; - bool flag; + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning + bst_uint segment_start; // Start offset of this node segment + int16_t batch_idx; // Which node in the batch does this item belong to + bool flag; // Result of op (is this item going left?) }; +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + // Segmented scan - resets if we cross batch boundaries + if (a.batch_idx == b.batch_idx) { + // Accumulate the flags, everything else stays the same + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; + } else { + return b; + } + } +}; + + /*! \brief Count how many rows are assigned to left node. 
*/ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, int batch_idx) { @@ -83,10 +97,9 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } template -void GetLeftCounts(const KernelArgs&args,common::Span ridx,common::Span scan_tmp, - common::Span d_left_counts, OpT op - ) { - +void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, + common::Span scan_inputs, + common::Span d_left_counts, OpT op) { // Launch 1 thread for each row dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { // Assign this thread to a row @@ -94,22 +107,18 @@ void GetLeftCounts(const KernelArgs&args,common::Span ridx,c std::size_t item_idx; args.AssignBatch(idx, batch_idx, item_idx); auto op_res = op(ridx[item_idx], args.data[batch_idx]); - scan_tmp[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - AtomicIncrement(d_left_counts.data(),op(ridx[item_idx], args.data[batch_idx]), batch_idx); + AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], args.data[batch_idx]), batch_idx); }); } -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - if (a.batch_idx == b.batch_idx) { - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; - } else { - return b; - } - } -}; - +// This is a transformer output iterator +// It takes the result of the scan and performs the partition +// To understand how a scan is used to partition elements see: +// Harris, Mark, Shubhabrata Sengupta, and John D. Owens. "Parallel prefix sum (scan) with CUDA." +// GPU gems 3.39 (2007): 851-876. 
struct WriteResultsFunctor { bst_uint* ridx_in; bst_uint* ridx_out; @@ -132,10 +141,10 @@ struct WriteResultsFunctor { } }; -template -void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, common::Span scan_tmp, - common::Span left_counts, OpT op, +template +void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, + common::Span ridx_tmp, common::Span scan_inputs, + common::Span left_counts, cudaStream_t stream) { static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; @@ -143,16 +152,16 @@ void SortPositionBatch(const KernelArgs& args, common::Span thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_tmp.data(), + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_inputs.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_tmp.data(), + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_inputs.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - auto item_idx = scan_tmp[idx].idx; + auto item_idx = scan_inputs[idx].idx; ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -183,7 +192,7 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray scan_tmp_; + dh::TemporaryArray scan_inputs_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; @@ -226,43 +235,59 @@ class RowPartitioner { const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { if (nidx.empty()) return; - // Impose this limit because we are passing arguments for each node to the kernel by parameter - // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - CHECK_LE(nidx.size(), kMaxBatch); - auto h_left_counts = pinned_.GetSpan(nidx.size(), 0); - dh::TemporaryArray d_left_counts(nidx.size(), 0); - - // Prepare kernel arguments - KernelArgs args; - std::copy(op_data.begin(), op_data.end(), args.data); - for (int i = 0; i < nidx.size(); i++) { - args.segments[i] = ridx_segments_.at(nidx[i]); - } - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_tmp_), dh::ToSpan(d_left_counts), op); - - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[0])); - - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_tmp_), - dh::ToSpan(d_left_counts), op, streams_[1]); - - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); - - // Update segments - for (int i = 0; i < nidx.size(); i++) { - auto segment = ridx_segments_.at(nidx[i]); - auto left_count = h_left_counts[i]; - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx[i], right_nidx[i]) + 1)); - ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + + // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying + // memory from device to host + for (std::size_t batch_start = 0; batch_start < nidx.size(); + batch_start += KernelBatchArgs::kMaxBatch) { + // Temporary arrays + auto h_left_counts = pinned_.GetSpan(KernelBatchArgs::kMaxBatch, 0); + dh::TemporaryArray d_left_counts(KernelBatchArgs::kMaxBatch, 0); + + std::size_t batch_end = std::min(batch_start + KernelBatchArgs::kMaxBatch, nidx.size()); + // Prepare kernel arguments + KernelBatchArgs args; + std::copy(op_data.begin() + batch_start, op_data.begin() + batch_end, args.data); + for (int i = 0; i < (batch_end - batch_start); i++) { + args.segments[i] = ridx_segments_.at(nidx[batch_start + i]); + } + + // Evaluate the operator for each row, where true means 'go left' + // Store the result of the operator for the next step + // Count the number of rows going left, store in d_left_counts + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op); + + // Start copying the counts to the host + // We overlap this transfer with the sort step using streams + // We only need the result after sorting to update the segment boundaries + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, streams_[0])); + + // Partition the rows according to the operator + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), + dh::ToSpan(d_left_counts), streams_[1]); + + dh::safe_cuda(cudaStreamSynchronize(streams_[0])); + + // Update segments + for (int i = 0; i < 
(batch_end - batch_start); i++) { + auto segment = ridx_segments_.at(nidx[batch_start + i]); + auto left_count = h_left_counts[i]; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize( + std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[batch_start + i], right_nidx[batch_start + i]) + 1)); + ridx_segments_[left_nidx[batch_start + i]] = + Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[batch_start + i]] = + Segment(segment.begin + left_count, segment.end); + } } } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 5ad2dbc3fc3a..22f17248b5f6 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -117,11 +117,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); - KernelArgs args; + KernelBatchArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), op, + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; From 8ab989e881f2b0dd133ab1f07267e3cc3d4ae5a7 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:08:58 -0700 Subject: [PATCH 29/64] Refactor FinalizePosition --- src/tree/gpu_hist/row_partitioner.cuh | 16 +---- src/tree/updater_gpu_hist.cu | 98 ++++++++++++--------------- 2 files changed, 43 insertions(+), 71 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 1cf753a59894..20315719490b 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -171,7 +171,6 @@ void SortPositionBatch(const KernelBatchArgs& args, common::SpanSetDevice(ctx->gpu_id); p_out_position->Resize(position_.size()); auto sorted_position = p_out_position->DevicePointer(); @@ -331,7 +317,7 @@ class RowPartitioner { RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position; - if (new_position == kIgnoredTreePosition) { + if (new_position == -1) { return; } d_position[idx] = new_position; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e72de08b203b..3d1e1ccb00ba 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -182,10 +182,11 @@ struct GPUHistMakerDevice { std::unique_ptr row_partitioner; DeviceHistogramStorage hist{}; - dh::caching_device_vector d_gpair; // storage for gpair; + dh::device_vector d_gpair; // storage for gpair; common::Span gpair; - dh::caching_device_vector monotone_constraints; + dh::device_vector monotone_constraints; + dh::device_vector update_predictions; /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; @@ -405,6 +406,16 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, HostDeviceVector* p_out_position) { + if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { + // see comment in the `FinalisePositionInPage`. 
+ LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + } + + // External memory will not use prediction cache + if (!p_fmat->SingleColBlock()) { + return; + } + dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), d_nodes.size() * sizeof(RegTree::Node), @@ -423,25 +434,9 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { - row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); - } - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; - } - if (page->n_rows == p_fmat->Info().num_row_) { - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } else { - for (auto const& batch : p_fmat->GetBatches(batch_param)) { - FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } - } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -453,13 +448,12 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; - row_partitioner->FinalisePosition( - ctx_, task, p_out_position, - [=] __device__(size_t row_id, int position) { + auto new_position_op = [=] __device__(size_t row_id) { // What happens if user prune the tree? if (!d_matrix.IsInRange(row_id)) { - return RowPartitioner::kIgnoredTreePosition; + return -1; } + int position = RegTree::kRoot; auto node = d_nodes[position]; while (!node.IsLeaf()) { @@ -487,41 +481,33 @@ struct GPUHistMakerDevice { } return position; - }, - [d_gpair] __device__(size_t ridx) { - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - return d_gpair[ridx].GetHess() - .0f == 0.f; - }); + }; + p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->Resize(page->n_rows); + update_predictions.resize(page->n_rows); + auto d_update_predictions = dh::ToSpan(update_predictions); + auto sorted_position = p_out_position->DevicePointer(); + dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { + bst_node_t position = new_position_op(idx); + d_update_predictions[idx]=d_nodes[position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + sorted_position[idx] = is_sampled? 
~position : position; + }); } - void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); - auto d_ridx = row_partitioner->GetRows(); - - GPUTrainingParam param_d(param); - dh::TemporaryArray device_node_sum_gradients(node_sum_gradients.size()); - - dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(), - sizeof(GradientPairPrecise) * node_sum_gradients.size(), - cudaMemcpyHostToDevice)); - auto d_position = row_partitioner->GetPosition(); - auto d_node_sum_gradients = device_node_sum_gradients.data().get(); - auto tree_evaluator = evaluator_.GetEvaluator(); - - auto const& h_nodes = p_tree->GetNodes(); - dh::caching_device_vector nodes(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), - h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice)); - auto d_nodes = dh::ToSpan(nodes); - dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable { - bst_node_t nidx = d_position[idx]; - auto weight = d_nodes[nidx].LeafValue(); - out_preds_d(d_ridx[idx]) += weight; + auto d_update_predictions = dh::ToSpan(update_predictions); + if (d_update_predictions.empty()) return false; + CHECK_EQ(out_preds_d.Size(), d_update_predictions.size()); + dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { + out_preds_d(idx) += d_update_predictions[idx]; }); - row_partitioner.reset(); + return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -853,9 +839,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + auto result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return true; + return result; } TrainParam param_; // NOLINT From d50ec4b442431f565958cbeb8eda75782337a44c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:21:36 -0700 Subject: [PATCH 30/64] Remove redundant functions --- src/tree/gpu_hist/row_partitioner.cu | 10 ---- src/tree/gpu_hist/row_partitioner.cuh | 44 -------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 57 ------------------- 3 files changed, 111 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 86642ab7170e..da78b20317f6 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -53,9 +53,6 @@ common::Span RowPartitioner::GetRows() { return dh::ToSpan(ridx_); } -common::Span RowPartitioner::GetPosition() { - return dh::ToSpan(position_); -} std::vector RowPartitioner::GetRowsHost( bst_node_t nidx) { auto span = GetRows(nidx); @@ -64,12 +61,5 @@ std::vector RowPartitioner::GetRowsHost( return rows; } -std::vector RowPartitioner::GetPositionHost() { - auto span = GetPosition(); - std::vector position(span.size()); - dh::CopyDeviceSpanToVector(&position, span); - return position; -} - }; // namespace tree }; // namespace xgboost diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 20315719490b..487eb1fca5ec 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -213,21 +213,11 @@ class RowPartitioner { */ common::Span GetRows(); - /** - * \brief Gets the 
tree position of all training instances. - */ - common::Span GetPosition(); - /** * \brief Convenience method for testing */ std::vector GetRowsHost(bst_node_t nidx); - /** - * \brief Convenience method for testing - */ - std::vector GetPositionHost(); - template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, @@ -289,40 +279,6 @@ class RowPartitioner { } } } - - /** - * \brief Finalise the position of all training instances after tree construction is - * complete. Does not update any other meta information in this data structure, so - * should only be used at the end of training. - * - * When the task requires update leaf, this function will copy the node index into - * p_out_position. The index is negated if it's being sampled in current iteration. - * - * \param p_out_position Node index for each row. - * \param op Device lambda. Should provide the row index and current position as an - * argument and return the new position for this training instance. - * \param sampled A device lambda to inform the partitioner whether a row is sampled. - */ - template - void FinalisePosition(Context const* ctx, ObjInfo task, - HostDeviceVector* p_out_position, FinalisePositionOpT op, - Sampledp sampledp) { - auto d_position = position_.data().get(); - const auto d_ridx = ridx_.data().get(); - p_out_position->SetDevice(ctx->gpu_id); - p_out_position->Resize(position_.size()); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(position_.size(), [=] __device__(size_t idx) { - auto position = d_position[idx]; - RowIndexT ridx = d_ridx[idx]; - bst_node_t new_position = op(ridx, position); - sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position; - if (new_position == -1) { - return; - } - d_position[idx] = new_position; - }); - } }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 22f17248b5f6..9861e79bba66 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -52,63 +52,6 @@ void TestUpdatePositionBatch() { TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } -void TestFinalise() { - const int kNumRows = 10; - - ObjInfo task{ObjInfo::kRegression, false, false}; - HostDeviceVector position; - Context ctx; - ctx.gpu_id = 0; - - { - RowPartitioner rp(0, kNumRows); - rp.FinalisePosition( - &ctx, task, &position, - [=] __device__(RowPartitioner::RowIndexT ridx, int position) { return 7; }, - [] XGBOOST_DEVICE(size_t idx) { return false; }); - - auto position = rp.GetPositionHost(); - for (auto p : position) { - EXPECT_EQ(p, 7); - } - } - - /** - * Test for sampling. - */ - dh::device_vector hess(kNumRows); - for (size_t i = 0; i < hess.size(); ++i) { - // removed rows, 0, 3, 6, 9 - if (i % 3 == 0) { - hess[i] = 0; - } else { - hess[i] = i; - } - } - - auto d_hess = dh::ToSpan(hess); - - RowPartitioner rp(0, kNumRows); - rp.FinalisePosition( - &ctx, task, &position, - [] __device__(RowPartitioner::RowIndexT ridx, bst_node_t position) { - return ridx % 2 == 0 ? 1 : 2; - }, - [d_hess] __device__(size_t ridx) { return d_hess[ridx] - 0.f == 0.f; }); - - auto const& h_position = position.ConstHostVector(); - for (size_t ridx = 0; ridx < h_position.size(); ++ridx) { - if (ridx % 3 == 0) { - ASSERT_LT(h_position[ridx], 0); - } else { - ASSERT_EQ(h_position[ridx], ridx % 2 == 0 ? 
1 : 2); - } - } -} - -TEST(RowPartitioner, Finalise) { TestFinalise(); } - - void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); From c34c3ad07fe753a608adb57fc6368f46f6ac2e39 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:31:04 -0700 Subject: [PATCH 31/64] Lint --- src/tree/gpu_hist/row_partitioner.cu | 13 +---- src/tree/gpu_hist/row_partitioner.cuh | 2 - src/tree/updater_gpu_hist.cu | 69 ++++++++++++++------------- 3 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index da78b20317f6..b079189e6b7d 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -10,21 +10,12 @@ namespace xgboost { namespace tree { -void Reset(int device_idx, common::Span ridx, - common::Span position) { - CHECK_EQ(ridx.size(), position.size()); - dh::LaunchN(ridx.size(), [=] __device__(size_t idx) { - ridx[idx] = idx; - position[idx] = 0; - }); -} RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_inputs_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), scan_inputs_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); - - Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); + thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); streams_.resize(2); for (auto& stream : streams_) { dh::safe_cuda(cudaStreamCreate(&stream)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 487eb1fca5ec..17a24e9600db 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -192,8 +192,6 @@ class RowPartitioner { // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; dh::TemporaryArray scan_inputs_; - /*! \brief mapping for row -> node id. */ - dh::TemporaryArray position_; dh::PinnedMemory pinned_; std::vector streams_; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1e1ccb00ba..3d1c38ba51d7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -383,7 +383,8 @@ struct GPUHistMakerDevice { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); row_partitioner->UpdatePositionBatch( - nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, const NodeSplitData& data) { + nidx, left_nidx, right_nidx, split_data, + [=] __device__(bst_uint ridx, const NodeSplitData& data) { // given a row index, returns the node id it belongs to bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value @@ -392,7 +393,8 @@ struct GPUHistMakerDevice { go_left = data.split_node.DefaultLeft(); } else { if (data.split_type == FeatureType::kCategorical) { - go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); + go_left = common::Decision(data.node_cats.Bits(), cut_value, + data.split_node.DefaultLeft()); } else { go_left = cut_value <= data.split_node.SplitCond(); } @@ -449,39 +451,38 @@ struct GPUHistMakerDevice { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; auto new_position_op = [=] __device__(size_t row_id) { - // What happens if user prune the tree? 
- if (!d_matrix.IsInRange(row_id)) { - return -1; + // What happens if user prune the tree? + if (!d_matrix.IsInRange(row_id)) { + return -1; + } + int position = RegTree::kRoot; + auto node = d_nodes[position]; + + while (!node.IsLeaf()) { + bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(d_feature_types, position)) { + auto node_cats = categories.subspan(categories_segments[position].beg, + categories_segments[position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); } - int position = RegTree::kRoot; - auto node = d_nodes[position]; - - while (!node.IsLeaf()) { - bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(d_feature_types, position)) { - auto node_cats = - categories.subspan(categories_segments[position].beg, - categories_segments[position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - position = node.LeftChild(); - } else { - position = node.RightChild(); - } - } - node = d_nodes[position]; + if (go_left) { + position = node.LeftChild(); + } else { + position = node.RightChild(); } + } + node = d_nodes[position]; + } - return position; - }; + return position; + }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); @@ -489,11 +490,11 @@ struct GPUHistMakerDevice { auto sorted_position = p_out_position->DevicePointer(); dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { bst_node_t position = new_position_op(idx); - d_update_predictions[idx]=d_nodes[position].LeafValue(); + d_update_predictions[idx] = d_nodes[position].LeafValue(); // FIXME(jiamingy): Doesn't work when sampling is used with external memory as // the sampler compacts the gradient vector. bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled? ~position : position; + sorted_position[idx] = is_sampled ? 
~position : position; }); } From 47bfc6e3e34dea03368d318225ba493d275ad0c8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:35:20 -0700 Subject: [PATCH 32/64] Remove old kernel --- src/common/device_helpers.cuh | 74 ----------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 3fdb994fce3d..334e3b4f89bf 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1639,78 +1639,4 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; - -struct PartitionScanPair { - int left; - int right; -}; - -inline __device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { - PartitionScanPair c{a.left + b.left, a.right + b.right}; - return c; -} - -template -class BlockPartition{ - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp; - - __shared__ int16_t lcomp[kBlockSize*kItemsPerThread]; - __shared__ int16_t rcomp[kBlockSize*kItemsPerThread]; - __shared__ unsigned long long int tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - int loffset = 0, part = left_count, roffset = part; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; - - PartitionScanPair flag[kItemsPerThread]; - for (int i = 0; i < kItemsPerThread; i++) { - flag[i].left = loff + i < part ? !op(begin[loff + i]) : 0; - flag[i].right = roff + i < count ? op(begin[roff + i]) : 0; - } - // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum[kItemsPerThread]; - PartitionScanPair sum; - BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - int minlen = sum.left < sum.right ? sum.left : sum.right; - // compaction to figure out the right locations to swap - for (int i = 0; i < kItemsPerThread; i++) { - if (flag[i].left) lcomp[partial_sum[i].left] = tid * kItemsPerThread+i; - if (flag[i].right) rcomp[partial_sum[i].right] = tid * kItemsPerThread+i; - } - __syncthreads(); - - // swap the 'misfit's - for (int i = tid; i < minlen; i += kBlockSize) { - auto a = begin[lcomp[i] + loffset]; - auto b = begin[rcomp[i] + roffset]; - begin[lcomp[i] + loffset] = b; - begin[rcomp[i] + roffset] = a; - } - loffset = sum.left == minlen ? loffset + kBlockSize * kItemsPerThread : loffset + lcomp[minlen]; - roffset = sum.right == minlen ? 
roffset + kBlockSize * kItemsPerThread : roffset + rcomp[minlen]; - } - return left_count; - } -}; } // namespace dh From a53ba8726e4cf03044dea995d1254b15b9af5f12 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 04:03:07 -0700 Subject: [PATCH 33/64] Add tests for AtomicIncrement --- src/tree/gpu_hist/row_partitioner.cuh | 8 +- tests/cpp/common/test_device_helpers.cu | 79 ------------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 22 ++++++ 3 files changed, 26 insertions(+), 83 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 17a24e9600db..dc7605305912 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -29,7 +29,7 @@ struct Segment { template struct KernelBatchArgs { - static const int kMaxBatch = 8; + static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; @@ -81,12 +81,12 @@ struct IndexFlagOp { __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, int batch_idx) { int mask = __activemask(); - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, 0)); + int leader = __ffs(mask) - 1; + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); // If all threads here are working on the same node // we can do a more efficient reduction with warp intrinsics if (group_is_contiguous) { unsigned ballot = __ballot_sync(mask, increment); - int leader = __ffs(mask) - 1; if (threadIdx.x % 32 == leader) { atomicAdd(d_counts + batch_idx, // NOLINT __popc(ballot)); // NOLINT @@ -197,7 +197,7 @@ class RowPartitioner { public: RowPartitioner(int device_idx, size_t num_rows); - ~RowPartitioner(); + ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 18fdb5b7eb34..6e8668bd2581 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include "../../../src/common/device_helpers.cuh" @@ -265,82 +264,4 @@ void TestAtomicAdd() { TEST(AtomicAdd, Int64) { TestAtomicAdd(); } - -template -__global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = dh::BlockPartition().Partition(begin, end, op); - if (threadIdx.x == 0) { - *count_out = count; - } -} - -template -void TestBlockPartition(thrust::device_vector& x) { - thrust::device_vector count(1); - - auto op = [] __device__(int y) { return y % 2 == 0; }; - TestBlockPartitionKernel - <<<1, kBlockSize>>>(x.data().get(), x.data().get() + x.size(), count.data().get(), op); - - auto reference = thrust::count_if(x.begin(), x.end(), op); - EXPECT_EQ(count[0], reference); - - auto left_partition_count = thrust::count_if(x.begin(), x.begin() + count[0], op); - EXPECT_EQ(count[0], left_partition_count); - auto right_partition_count = thrust::count_if(x.begin() + count[0], x.end(), op); - EXPECT_EQ(0, right_partition_count); -} - -TEST(BlockPartition, BlockPartitionEmpty) { - thrust::device_vector x; - TestBlockPartition<256>(x); -} - -TEST(BlockPartition, BlockPartitionUniform) { - thrust::device_vector x(100); - TestBlockPartition<256>(x); - thrust::fill(x.begin(),x.end(),1); - TestBlockPartition<256>(x); -} - -void MakeRandom(thrust::device_vector& x, int seed) { - auto 
counting = thrust::make_counting_iterator(0); - thrust::transform(counting, counting + x.size(), x.begin(), [=] __device__(auto idx) { - thrust::default_random_engine gen(seed); - thrust::uniform_int_distribution dist; - gen.discard(idx); - return dist(gen); - }); -} - -TEST(BlockPartition, BlockPartitionBasic) { - thrust::device_vector x = std::vector{0,1,2}; - TestBlockPartition<256>(x); -} - -TEST(BlockPartition, BlockPartition) { - int sizes[] = {1, 37, 1092}; - int seeds[] = {0, 1, 2, 3, 4}; - for (auto seed : seeds) { - for (auto size : sizes) { - thrust::device_vector x(size); - MakeRandom(x, seed); - thrust::device_vector y = x; - TestBlockPartition<1>(y); - y = x; - TestBlockPartition<1024>(y); - y = x; - TestBlockPartition<37>(y); - } - } -} - -TEST(BlockPartition, BlockPartitionBenchmark) { - for (int i = 0; i < 20; i++) { - thrust::device_vector x(10000000); - MakeRandom(x, i); - TestBlockPartition<1024>(x); - } -} - } // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 9861e79bba66..a8672dc2ec1b 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -87,5 +87,27 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } +void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { + thrust::device_vector group(group_in); + thrust::device_vector increment(increment_in); + thrust::device_vector reference(group_in.size()); + thrust::device_vector result(group_in.size()); + + auto d_group = group.data().get(); + auto d_increment = increment.data().get(); + auto d_reference = reference.data().get(); + auto d_result = result.data().get(); + dh::LaunchN(group.size(), [=] __device__(std::size_t idx) { + AtomicIncrement(d_result, d_increment[idx], d_group[idx]); + atomicAdd(d_reference + d_group[idx], d_increment[idx]); + }); + + EXPECT_EQ(reference, result); +} + +TEST(GpuHist, AtomicIncrement) { + TestAtomicIncrement({0, 0, 0}, {1, 0, 1}); + TestAtomicIncrement({0, 0, 1}, {1, 0, 1}); +} } // namespace tree } // namespace xgboost From 7450d68bbe5997db50ae8af56f47d267d7e8de8e Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 23 May 2022 04:33:42 -0700 Subject: [PATCH 34/64] Change lambda to kernel --- src/tree/gpu_hist/row_partitioner.cuh | 52 +++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index dc7605305912..95fe3d3454db 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -32,9 +32,8 @@ struct KernelBatchArgs { static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) const { + __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) { std::size_t sum = 0; for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { @@ -96,22 +95,63 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } + +template +__global__ void GetLeftCountsKernel(KernelBatchArgs args, common::Span ridx, + common::Span scan_inputs, + common::Span d_left_counts, OpT op, std::size_t n){ + + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i 
< sizeof(KernelBatchArgs); i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); + // Assign this thread to a row + std::size_t idx = blockIdx.x *blockDim.x + threadIdx.x; + if (idx >= n) return; + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); +} + + template void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op) { // Launch 1 thread for each row - dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + constexpr int kBlockSize = 256; + const int grid_size = + static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize)); + +GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); + +/* + dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); + // Assign this thread to a row int16_t batch_idx; std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], args.data[batch_idx]), batch_idx); + AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], s_args.data[batch_idx]), batch_idx); }); + */ } // This is a transformer output iterator From 6df1259f375d7d5ca26eaed005d583c7eb989c0f Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 24 May 2022 04:46:42 -0700 Subject: [PATCH 35/64] Smem + lineinfo --- cmake/Utils.cmake | 1 + src/tree/gpu_hist/row_partitioner.cuh | 33 ++++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 963c494ccf26..6c124d625f3a 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -136,6 +136,7 @@ function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> + $<$:-lineinfo> $<$:${GEN_CODE}> $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$:-Xfatbin=-compress-all>) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 95fe3d3454db..a8fb15b97752 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -97,27 +97,27 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo template -__global__ void GetLeftCountsKernel(KernelBatchArgs args, common::Span ridx, +__global__ void GetLeftCountsKernel(const KernelBatchArgs args, common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op, std::size_t n){ __shared__ KernelBatchArgs s_args; - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs)/8; i += kBlockSize) { + 
reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } __syncthreads(); - // Assign this thread to a row - std::size_t idx = blockIdx.x *blockDim.x + threadIdx.x; - if (idx >= n) return; - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + scan_inputs[idx] = + IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), + batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + } } @@ -128,9 +128,10 @@ void GetLeftCounts(const KernelBatchArgs& args, common::Span // Launch 1 thread for each row constexpr int kBlockSize = 256; const int grid_size = - static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize)); + std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); + -GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); + GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); /* dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { From 40109427c1490c9426c9702c93747107b8436499 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 24 May 2022 05:51:57 -0700 Subject: [PATCH 36/64] Use stream --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a8fb15b97752..a0c5be5c6502 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -201,7 +201,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span Date: Wed, 25 May 2022 07:46:20 -0700 Subject: [PATCH 37/64] Fast global stores --- src/tree/gpu_hist/row_partitioner.cuh | 76 ++++++++++++++++++++------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a0c5be5c6502..d806c8e87ca6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -95,31 +95,67 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } - template -__global__ void GetLeftCountsKernel(const KernelBatchArgs args, common::Span ridx, - common::Span scan_inputs, - common::Span d_left_counts, OpT op, std::size_t n){ +__global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( + const KernelBatchArgs args, common::Span ridx, + common::Span scan_inputs, common::Span d_left_counts, + OpT op, std::size_t n) { + // Load this large struct in shared memory + // if left to its own devices the compiler loads this very slowly + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 8; i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); - __shared__ KernelBatchArgs s_args; + // Global writes of IndexFlagTuple are inefficient due to its 16b size + // we can use cub to optimise this + static_assert(sizeof(IndexFlagTuple) 
== 16, "Expected IndexFlagTuple to be 16 bytes."); + constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); + typedef cub::BlockStore BlockStoreT; + __shared__ typename BlockStoreT::TempStorage temp_storage; - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs)/8; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - __syncthreads(); - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = - IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), - batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + // Use the raw pointer because the performance of global writes matters here + // We don't really need the bounds checking + IndexFlagTuple* out_ptr = scan_inputs.data(); + + auto get_tuple = [=]__device__ (auto idx){ + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; + }; + + // Process full tiles + std::size_t tile_offset = blockIdx.x * kBlockSize; + while (tile_offset + kBlockSize <= n) { + std::size_t idx = tile_offset + threadIdx.x; + auto tuple = get_tuple(idx); + auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); + BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + tile_offset += kBlockSize * gridDim.x; + } + + // Process partial tile + if (tile_offset < n) { + // Make sure we don't compute a negative number with unsigned integers + int valid_items = int(int64_t(n) - int64_t(tile_offset)); + std::size_t idx = tile_offset + threadIdx.x; + IndexFlagTuple tuple; + if (idx < n) { + tuple = get_tuple(idx); } -} + auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); + BlockStoreT(temp_storage) + .Store(block_write_ptr, *static_cast(static_cast(&tuple)), + valid_items * kTupleWords); + } +} template void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, From 24fb339c46bd7d4f9070c16d4951f03f74e96b71 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 27 May 2022 06:06:25 -0700 Subject: [PATCH 38/64] Fast load without shmem --- src/tree/gpu_hist/row_partitioner.cuh | 39 ++++++++++++++----- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d806c8e87ca6..a16989a165bd 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,14 +26,15 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; - template struct KernelBatchArgs { static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; + + KernelBatchArgs() = default; // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) { + __device__ void AssignBatch(std::size_t idx, int16_t& batch_idx, std::size_t& item_idx) const { std::size_t sum = 0; for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { @@ 
-95,6 +96,17 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } +template +__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, OpT op){ + int16_t batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; +} + template __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( const KernelBatchArgs args, common::Span ridx, @@ -102,12 +114,16 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( OpT op, std::size_t n) { // Load this large struct in shared memory // if left to its own devices the compiler loads this very slowly - __shared__ KernelBatchArgs s_args; - - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 8; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + //__shared__ KernelBatchArgs s_args; + /* + __shared__ cub::Uninitialized> s_temp; + KernelBatchArgs& s_args = s_temp.Alias(); + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } + __syncthreads(); + */ // Global writes of IndexFlagTuple are inefficient due to its 16b size // we can use cub to optimise this @@ -120,7 +136,8 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( // We don't really need the bounds checking IndexFlagTuple* out_ptr = scan_inputs.data(); - auto get_tuple = [=]__device__ (auto idx){ + /* + auto get_tuple = [&]__device__ (auto idx){ int16_t batch_idx; std::size_t item_idx; s_args.AssignBatch(idx, batch_idx, item_idx); @@ -129,12 +146,14 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( return IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; }; + */ // Process full tiles std::size_t tile_offset = blockIdx.x * kBlockSize; while (tile_offset + kBlockSize <= n) { std::size_t idx = tile_offset + threadIdx.x; - auto tuple = get_tuple(idx); + //auto tuple = get_tuple(idx); + auto tuple = GetTuple(idx,ridx,d_left_counts,args,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; @@ -147,7 +166,8 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( std::size_t idx = tile_offset + threadIdx.x; IndexFlagTuple tuple; if (idx < n) { - tuple = get_tuple(idx); + tuple = GetTuple(idx,ridx,d_left_counts,args,op); + //tuple = get_tuple(idx); } auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); @@ -225,6 +245,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span(), write_results); auto counting = thrust::make_counting_iterator(0llu); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index a8672dc2ec1b..0fded6828236 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -64,7 +64,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector Date: Mon, 30 May 2022 04:59:35 -0700 Subject: [PATCH 39/64] Memcpy version --- src/tree/gpu_hist/row_partitioner.cuh | 121 ++++++++++++------ 
.../cpp/tree/gpu_hist/test_row_partitioner.cu | 18 ++- 2 files changed, 94 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a16989a165bd..3d47b986f240 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -54,6 +54,29 @@ struct KernelBatchArgs { } }; +template +struct KernelMemcpyArgs { + Segment segment; + OpDataT data; +}; + +template +__device__ void AssignBatch(const common::Span> batch_info, + std::size_t idx, int16_t& batch_idx, std::size_t& item_idx, OpDataT&data) { + const auto ptr = batch_info.data(); + std::size_t sum = 0; + + for (int16_t i = 0; i < batch_info.size(); i++) { + if (sum + ptr[i].segment.Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + ptr[i].segment.begin; + data = ptr[i].data; + break; + } + sum += ptr[i].segment.Size(); + } +} + // We can scan over this tuple, where the scan gives us information on how to partition inputs // according to the flag struct IndexFlagTuple { @@ -97,11 +120,13 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } template -__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, OpT op){ +__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, const common::Span> batch_info, OpT op){ int16_t batch_idx; std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); + OpDataT data; + AssignBatch(batch_info,idx, batch_idx, item_idx, data); + //args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], data); AtomicIncrement(d_left_counts.data(), op_res, batch_idx); return IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; @@ -109,7 +134,8 @@ __device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span template __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( - const KernelBatchArgs args, common::Span ridx, + const KernelBatchArgs args, const common::Span> batch_info, + common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op, std::size_t n) { // Load this large struct in shared memory @@ -121,7 +147,7 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } - + __syncthreads(); */ @@ -153,9 +179,9 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( while (tile_offset + kBlockSize <= n) { std::size_t idx = tile_offset + threadIdx.x; //auto tuple = get_tuple(idx); - auto tuple = GetTuple(idx,ridx,d_left_counts,args,op); + auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + //BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; } @@ -166,20 +192,23 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( std::size_t idx = tile_offset + threadIdx.x; IndexFlagTuple tuple; if (idx < n) { - tuple = GetTuple(idx,ridx,d_left_counts,args,op); + tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); 
//tuple = get_tuple(idx); } + /* auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage) .Store(block_write_ptr, *static_cast(static_cast(&tuple)), valid_items * kTupleWords); + */ } } template -void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, - common::Span scan_inputs, +void GetLeftCounts(const KernelBatchArgs& args, + const common::Span> batch_info, + common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op) { // Launch 1 thread for each row constexpr int kBlockSize = 256; @@ -187,28 +216,7 @@ void GetLeftCounts(const KernelBatchArgs& args, common::Span std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); - GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); - -/* - dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { - __shared__ KernelBatchArgs s_args; - - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - __syncthreads(); - - // Assign this thread to a row - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], s_args.data[batch_idx]), batch_idx); - }); - */ + GetLeftCountsKernel<<>>(args, batch_info,ridx, scan_inputs,d_left_counts,op, args.TotalRows()); } // This is a transformer output iterator @@ -238,28 +246,43 @@ struct WriteResultsFunctor { } }; -template -void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, +template +void SortPositionBatch(const KernelBatchArgs& args, + const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, common::Span scan_inputs, - common::Span left_counts, - cudaStream_t stream) { + common::Span left_counts, OpT op,cudaStream_t stream) { static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + auto op_res = op(ridx[item_idx], data); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(batch_info.data()[batch_idx].segment.begin), batch_idx, + op_res}; + }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_inputs.data(), + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_inputs.data(), + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), stream, [=] __device__(std::size_t idx) { - auto item_idx = scan_inputs[idx].idx; + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, 
batch_idx, item_idx, data); ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -291,6 +314,7 @@ class RowPartitioner { dh::TemporaryArray ridx_tmp_; dh::TemporaryArray scan_inputs_; dh::PinnedMemory pinned_; + dh::PinnedMemory pinned2_; std::vector streams_; public: @@ -324,6 +348,18 @@ class RowPartitioner { CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); + auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); + + std::size_t total_rows = 0; + for (int i = 0; i < nidx.size(); i++) { + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)), op_data.at(i)}; + total_rows += ridx_segments_.at(nidx.at(i)).Size(); + } + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(KernelMemcpyArgs), + cudaMemcpyDefault, streams_[0])); + // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying // memory from device to host for (std::size_t batch_start = 0; batch_start < nidx.size(); @@ -343,7 +379,8 @@ class RowPartitioner { // Evaluate the operator for each row, where true means 'go left' // Store the result of the operator for the next step // Count the number of rows going left, store in d_left_counts - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op); + GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), + dh::ToSpan(d_left_counts), op); // Start copying the counts to the host // We overlap this transfer with the sort step using streams @@ -354,8 +391,8 @@ class RowPartitioner { cudaMemcpyDefault, streams_[0])); // Partition the rows according to the operator - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), - dh::ToSpan(d_left_counts), streams_[1]); + SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), + dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); dh::safe_cuda(cudaStreamSynchronize(streams_[0])); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 0fded6828236..c29032c5790d 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -60,12 +60,24 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); + std::vector> h_batch_info(segments.size()); + dh::TemporaryArray> d_batch_info(segments.size()); + + std::size_t total_rows = 0; + for (int i = 0; i < segments.size(); i++) { + h_batch_info[i] = {segments.at(i), 0}; + total_rows += segments.at(i).Size(); + } + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(KernelMemcpyArgs), + cudaMemcpyDefault, nullptr)); KernelBatchArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), - nullptr); + GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(scan_tmp), + dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(scan_tmp), dh::ToSpan(left_counts),op, nullptr); auto op_without_data = [=] __device__(auto ridx) 
{ return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 7d5d7e71e93d765a14d9c67df21d2e3c9c045758 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 30 May 2022 08:32:42 -0700 Subject: [PATCH 40/64] Remove left counts kernel --- src/tree/gpu_hist/row_partitioner.cuh | 47 +++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 3d47b986f240..05debc22e603 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -83,6 +83,7 @@ struct IndexFlagTuple { bst_uint idx; // The location of the item we are working on in ridx_ bst_uint flag_scan; // This gets populated after scanning bst_uint segment_start; // Start offset of this node segment + bst_uint segment_end; // End offset of this node segment int16_t batch_idx; // Which node in the batch does this item belong to bool flag; // Result of op (is this item going left?) }; @@ -92,7 +93,7 @@ struct IndexFlagOp { // Segmented scan - resets if we cross batch boundaries if (a.batch_idx == b.batch_idx) { // Accumulate the flags, everything else stays the same - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.segment_end,b.batch_idx, b.flag}; } else { return b; } @@ -129,7 +130,7 @@ __device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span auto op_res = op(ridx[item_idx], data); AtomicIncrement(d_left_counts.data(), op_res, batch_idx); return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + bst_uint(args.segments[batch_idx].begin),bst_uint(args.segments[batch_idx].end), batch_idx, op_res}; } template @@ -153,7 +154,6 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( // Global writes of IndexFlagTuple are inefficient due to its 16b size // we can use cub to optimise this - static_assert(sizeof(IndexFlagTuple) == 16, "Expected IndexFlagTuple to be 16 bytes."); constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); typedef cub::BlockStore BlockStoreT; __shared__ typename BlockStoreT::TempStorage temp_storage; @@ -181,7 +181,7 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( //auto tuple = get_tuple(idx); auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - //BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; } @@ -196,12 +196,10 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( //tuple = get_tuple(idx); } - /* auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage) .Store(block_write_ptr, *static_cast(static_cast(&tuple)), valid_items * kTupleWords); - */ } } @@ -234,13 +232,23 @@ struct WriteResultsFunctor { // node so far during scan. 
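// A minimal host-side sketch (not part of the patch; names are illustrative) of the
// scatter-address arithmetic in WriteResultsFunctor: an inclusive scan of the go-left
// flags gives each flagged row its slot at the front of the segment, while unflagged
// rows are packed backwards from the end of the segment.
#include <cstddef>
#include <vector>

std::vector<unsigned> ScanPartitionSketch(const std::vector<unsigned>& rows,
                                          const std::vector<char>& go_left) {
  const std::size_t segment_begin = 0, segment_end = rows.size();
  std::vector<unsigned> out(rows.size());
  std::size_t flag_scan = 0;  // inclusive scan of go_left, computed sequentially here
  for (std::size_t idx = segment_begin; idx < segment_end; ++idx) {
    flag_scan += go_left[idx] ? 1 : 0;
    std::size_t scatter_address;
    if (go_left[idx]) {
      const std::size_t num_previous_flagged = flag_scan - 1;  // -1 because the scan is inclusive
      scatter_address = segment_begin + num_previous_flagged;
    } else {
      const std::size_t num_previous_unflagged = (idx - segment_begin) - flag_scan;
      // Right-hand rows fill the segment from the back, in reverse encounter order
      scatter_address = segment_end - num_previous_unflagged - 1;
    }
    out[scatter_address] = rows[idx];
  }
  return out;
}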
std::size_t scatter_address; if (x.flag) { - scatter_address = x.segment_start + x.flag_scan - 1; // -1 because inclusive scan + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + scatter_address = x.segment_start + num_previous_flagged; } else { + + bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; // current number of rows belong to right node + total number of rows // belong to left node - scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + // scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + scatter_address = x.segment_end - num_previous_unflagged - 1; } ridx_out[scatter_address] = ridx_in[x.idx]; + + if (x.idx == (x.segment_end - 1)) { + // Write out counts + left_counts[x.batch_idx] = x.flag_scan; + } + // Discard return {}; } @@ -252,7 +260,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, common::Span ridx_tmp, common::Span scan_inputs, common::Span left_counts, OpT op,cudaStream_t stream) { - static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); + //static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = @@ -266,7 +274,7 @@ void SortPositionBatch(const KernelBatchArgs& args, AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto op_res = op(ridx[item_idx], data); return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin), batch_idx, + bst_uint(batch_info.data()[batch_idx].segment.begin),bst_uint(batch_info.data()[batch_idx].segment.end), batch_idx, op_res}; }); size_t temp_bytes = 0; @@ -358,7 +366,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(KernelMemcpyArgs), - cudaMemcpyDefault, streams_[0])); + cudaMemcpyDefault, streams_[1])); // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying // memory from device to host @@ -379,22 +387,19 @@ class RowPartitioner { // Evaluate the operator for each row, where true means 'go left' // Store the result of the operator for the next step // Count the number of rows going left, store in d_left_counts - GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), - dh::ToSpan(d_left_counts), op); + //GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), + //dh::ToSpan(d_left_counts), op); - // Start copying the counts to the host - // We overlap this transfer with the sort step using streams - // We only need the result after sorting to update the segment boundaries - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[0])); // Partition the rows according to the operator SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, streams_[1])); - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); + dh::safe_cuda(cudaStreamSynchronize(streams_[1])); // Update segments for (int i = 0; i < (batch_end - 
batch_start); i++) { From 77f85504d583c04f1328158125d65b2c5fffd3bd Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 31 May 2022 03:02:37 -0700 Subject: [PATCH 41/64] Unstable partition --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 280 ++++++------------ .../cpp/tree/gpu_hist/test_row_partitioner.cu | 19 +- 3 files changed, 95 insertions(+), 206 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index b079189e6b7d..7676c8e67495 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -12,7 +12,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), scan_inputs_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 05debc22e603..f062e2a4ed48 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,33 +26,7 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -template -struct KernelBatchArgs { - static const int kMaxBatch = 32; - Segment segments[kMaxBatch]; - OpDataT data[kMaxBatch]; - - KernelBatchArgs() = default; - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t& batch_idx, std::size_t& item_idx) const { - std::size_t sum = 0; - for (int16_t i = 0; i < kMaxBatch; i++) { - if (sum + segments[i].Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + segments[i].begin; - break; - } - sum += segments[i].Size(); - } - } - std::size_t TotalRows() const { - std::size_t total_rows = 0; - for (auto segment : segments) { - total_rows += segment.Size(); - } - return total_rows; - } -}; +using PartitionCountsT = thrust::pair; template struct KernelMemcpyArgs { @@ -120,103 +94,6 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } -template -__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, const common::Span> batch_info, OpT op){ - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info,idx, batch_idx, item_idx, data); - //args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], data); - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin),bst_uint(args.segments[batch_idx].end), batch_idx, op_res}; -} - -template -__global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( - const KernelBatchArgs args, const common::Span> batch_info, - common::Span ridx, - common::Span scan_inputs, common::Span d_left_counts, - OpT op, std::size_t n) { - // Load this large struct in shared memory - // if left to its own devices the compiler loads this very slowly - //__shared__ KernelBatchArgs s_args; - /* - __shared__ cub::Uninitialized> s_temp; - KernelBatchArgs& s_args = s_temp.Alias(); - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - 
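// Host-side sketch (illustrative names, not the patch code) of the segment walk that the
// AssignBatch helper performs for every thread: accumulate segment sizes over the small
// per-node {segment, op data} array until the flat index falls inside one of them, then
// recover which node the thread serves and which slot of ridx it touches.
#include <cstddef>
#include <vector>

struct SegmentSketch {
  std::size_t begin{0}, end{0};
  std::size_t Size() const { return end - begin; }
};

template <typename OpDataT>
struct NodeEntrySketch {
  SegmentSketch segment;
  OpDataT data;
};

template <typename OpDataT>
bool AssignBatchSketch(const std::vector<NodeEntrySketch<OpDataT>>& batch_info,
                       std::size_t flat_idx, int* node_in_batch, std::size_t* item_idx) {
  std::size_t sum = 0;
  for (int i = 0; i < static_cast<int>(batch_info.size()); ++i) {
    if (sum + batch_info[i].segment.Size() > flat_idx) {
      *node_in_batch = i;                                           // node this index serves
      *item_idx = (flat_idx - sum) + batch_info[i].segment.begin;   // position inside ridx
      return true;
    }
    sum += batch_info[i].segment.Size();
  }
  return false;  // flat_idx is past the total number of rows in the batch
}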
- __syncthreads(); - */ - - // Global writes of IndexFlagTuple are inefficient due to its 16b size - // we can use cub to optimise this - constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); - typedef cub::BlockStore BlockStoreT; - __shared__ typename BlockStoreT::TempStorage temp_storage; - - // Use the raw pointer because the performance of global writes matters here - // We don't really need the bounds checking - IndexFlagTuple* out_ptr = scan_inputs.data(); - - /* - auto get_tuple = [&]__device__ (auto idx){ - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; - }; - */ - - // Process full tiles - std::size_t tile_offset = blockIdx.x * kBlockSize; - while (tile_offset + kBlockSize <= n) { - std::size_t idx = tile_offset + threadIdx.x; - //auto tuple = get_tuple(idx); - auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); - auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); - tile_offset += kBlockSize * gridDim.x; - } - - // Process partial tile - if (tile_offset < n) { - // Make sure we don't compute a negative number with unsigned integers - int valid_items = int(int64_t(n) - int64_t(tile_offset)); - std::size_t idx = tile_offset + threadIdx.x; - IndexFlagTuple tuple; - if (idx < n) { - tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); - //tuple = get_tuple(idx); - } - - auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage) - .Store(block_write_ptr, *static_cast(static_cast(&tuple)), - valid_items * kTupleWords); - } -} - -template -void GetLeftCounts(const KernelBatchArgs& args, - const common::Span> batch_info, - common::Span ridx, common::Span scan_inputs, - common::Span d_left_counts, OpT op) { - // Launch 1 thread for each row - constexpr int kBlockSize = 256; - const int grid_size = - std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); - - - GetLeftCountsKernel<<>>(args, batch_info,ridx, scan_inputs,d_left_counts,op, args.TotalRows()); -} - // This is a transformer output iterator // It takes the result of the scan and performs the partition // To understand how a scan is used to partition elements see: @@ -225,11 +102,9 @@ void GetLeftCounts(const KernelBatchArgs& args, struct WriteResultsFunctor { bst_uint* ridx_in; bst_uint* ridx_out; - unsigned long long int* left_counts; + PartitionCountsT *counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
std::size_t scatter_address; if (x.flag) { bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan @@ -237,16 +112,13 @@ struct WriteResultsFunctor { } else { bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; - // current number of rows belong to right node + total number of rows - // belong to left node - // scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; scatter_address = x.segment_end - num_previous_unflagged - 1; } ridx_out[scatter_address] = ridx_in[x.idx]; if (x.idx == (x.segment_end - 1)) { // Write out counts - left_counts[x.batch_idx] = x.flag_scan; + counts[x.batch_idx] = {x.flag_scan,0}; } // Discard @@ -255,13 +127,11 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch(const KernelBatchArgs& args, - const common::Span> batch_info, - common::Span ridx, - common::Span ridx_tmp, common::Span scan_inputs, - common::Span left_counts, OpT op,cudaStream_t stream) { - //static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; +void SortPositionBatch(const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); @@ -273,20 +143,67 @@ void SortPositionBatch(const KernelBatchArgs& args, OpDataT data; AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto op_res = op(ridx[item_idx], data); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin),bst_uint(batch_info.data()[batch_idx].segment.end), batch_idx, + return IndexFlagTuple{bst_uint(item_idx), + op_res, + bst_uint(batch_info.data()[batch_idx].segment.begin), + bst_uint(batch_info.data()[batch_idx].segment.end), + batch_idx, op_res}; }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - args.TotalRows(), stream); + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); dh::TemporaryArray temp(temp_bytes); cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + discard_write_iterator, IndexFlagOp(), total_rows, stream); // copy active segments back to original buffer - dh::LaunchN(args.TotalRows(), stream, [=] __device__(std::size_t idx) { + dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + ridx[item_idx] = ridx_tmp[item_idx]; + }); +} + +template +__global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( + const common::Span> batch_info, common::Span ridx, + common::Span ridx_tmp, common::Span counts, OpT op, + std::size_t total_rows) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + auto segment = batch_info[batch_idx].segment; + auto op_res = op(ridx[item_idx], data); + if (op_res) { + auto num_left_items = 
atomicAdd(&counts.data()[batch_idx].first, 1); + ridx_tmp[segment.begin + num_left_items] = ridx[item_idx]; + } else { + auto num_right_items = atomicAdd(&counts.data()[batch_idx].second, 1); + ridx_tmp[segment.end - num_right_items - 1] = ridx[item_idx]; + } + } +} + +template +void SortPositionBatchUnstable(const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + + constexpr int kBlockSize = 256; + const int grid_size = + std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + + SortPositionBatchUnstableKernel + <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); + + // copy active segments back to original buffer + dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { int16_t batch_idx; std::size_t item_idx; OpDataT data; @@ -320,7 +237,6 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray scan_inputs_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; std::vector streams_; @@ -368,53 +284,31 @@ class RowPartitioner { h_batch_info.size() * sizeof(KernelMemcpyArgs), cudaMemcpyDefault, streams_[1])); - // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying - // memory from device to host - for (std::size_t batch_start = 0; batch_start < nidx.size(); - batch_start += KernelBatchArgs::kMaxBatch) { - // Temporary arrays - auto h_left_counts = pinned_.GetSpan(KernelBatchArgs::kMaxBatch, 0); - dh::TemporaryArray d_left_counts(KernelBatchArgs::kMaxBatch, 0); - - std::size_t batch_end = std::min(batch_start + KernelBatchArgs::kMaxBatch, nidx.size()); - // Prepare kernel arguments - KernelBatchArgs args; - std::copy(op_data.begin() + batch_start, op_data.begin() + batch_end, args.data); - for (int i = 0; i < (batch_end - batch_start); i++) { - args.segments[i] = ridx_segments_.at(nidx[batch_start + i]); - } - - // Evaluate the operator for each row, where true means 'go left' - // Store the result of the operator for the next step - // Count the number of rows going left, store in d_left_counts - //GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), - //dh::ToSpan(d_left_counts), op); - - - // Partition the rows according to the operator - SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), - dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[1])); - - dh::safe_cuda(cudaStreamSynchronize(streams_[1])); - - // Update segments - for (int i = 0; i < (batch_end - batch_start); i++) { - auto segment = ridx_segments_.at(nidx[batch_start + i]); - auto left_count = h_left_counts[i]; - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize( - std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx[batch_start + i], right_nidx[batch_start + i]) + 1)); - ridx_segments_[left_nidx[batch_start + i]] = - Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[batch_start + i]] = - Segment(segment.begin + left_count, segment.end); - } + // Temporary arrays + auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); + dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); + + // 
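// Sketch (assumed names, error handling elided) of the readback pattern used around this
// call site: the kernel writes per-node counts on a stream, the counts are copied
// asynchronously into pinned host memory on the same stream, and the host only reads
// them after synchronising that stream. The patch reuses a pre-allocated pinned pool
// (pinned_) instead of allocating on every call as this sketch does.
#include <cuda_runtime.h>
#include <cstddef>
#include <utility>
#include <vector>

using CountPairSketch = std::pair<unsigned, unsigned>;  // (rows left, rows right) per node

void ReadBackCountsSketch(const CountPairSketch* d_counts, std::size_t n_nodes,
                          cudaStream_t stream, std::vector<CountPairSketch>* out) {
  CountPairSketch* h_counts = nullptr;
  // A pinned allocation keeps the copy below truly asynchronous with respect to the host
  cudaMallocHost(reinterpret_cast<void**>(&h_counts), sizeof(CountPairSketch) * n_nodes);
  cudaMemcpyAsync(h_counts, d_counts, sizeof(CountPairSketch) * n_nodes, cudaMemcpyDefault,
                  stream);
  cudaStreamSynchronize(stream);  // counts are only valid on the host after this point
  out->assign(h_counts, h_counts + n_nodes);
  cudaFreeHost(h_counts);
}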
Partition the rows according to the operator + SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), + dh::ToSpan(d_counts), total_rows,op, + streams_[1]); + dh::safe_cuda( + cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), + sizeof(decltype(d_counts)::value_type) * d_counts.size(), + cudaMemcpyDefault, streams_[1])); + + dh::safe_cuda(cudaStreamSynchronize(streams_[1])); + + // Update segments + for (int i = 0; i < nidx.size(); i++) { + auto segment = ridx_segments_.at(nidx[i]); + auto left_count = h_counts[i].first; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[i], right_nidx[i]) + 1)); + ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); } } }; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index c29032c5790d..21628aace40b 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -55,8 +55,7 @@ TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); - thrust::device_vector left_counts(segments.size()); - thrust::device_vector scan_tmp(ridx_in.size()); + thrust::device_vector counts(segments.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); @@ -71,23 +70,19 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), cudaMemcpyDefault, nullptr)); - KernelBatchArgs args; - std::copy(segments.begin(), segments.end(), args.segments); - std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(scan_tmp), - dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(scan_tmp), dh::ToSpan(left_counts),op, nullptr); + SortPositionBatchUnstable(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { auto begin = ridx.begin() + segments[i].begin; auto end = ridx.begin() + segments[i].end; + PartitionCountsT count = counts[i]; auto left_partition_count = - thrust::count_if(thrust::device, begin, begin + left_counts[i], op_without_data); - EXPECT_EQ(left_partition_count, left_counts[i]); + thrust::count_if(thrust::device, begin, begin + count.first, op_without_data); + EXPECT_EQ(left_partition_count, count.first); auto right_partition_count = - thrust::count_if(thrust::device, begin + left_counts[i], end, op_without_data); + thrust::count_if(thrust::device, begin + count.first, end, op_without_data); EXPECT_EQ(right_partition_count, 0); } } From 14d866306832a677c821f247ff4dcada95338c6a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 31 May 2022 04:25:44 -0700 Subject: [PATCH 42/64] Warp aggregates --- src/tree/gpu_hist/row_partitioner.cuh | 87 ++++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 +- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh 
b/src/tree/gpu_hist/row_partitioner.cuh index f062e2a4ed48..238a88a430e1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -75,25 +75,6 @@ struct IndexFlagOp { }; -/*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, - int batch_idx) { - int mask = __activemask(); - int leader = __ffs(mask) - 1; - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); - // If all threads here are working on the same node - // we can do a more efficient reduction with warp intrinsics - if (group_is_contiguous) { - unsigned ballot = __ballot_sync(mask, increment); - if (threadIdx.x % 32 == leader) { - atomicAdd(d_counts + batch_idx, // NOLINT - __popc(ballot)); // NOLINT - } - } else { - atomicAdd(d_counts + batch_idx, increment); - } -} - // This is a transformer output iterator // It takes the result of the scan and performs the partition // To understand how a scan is used to partition elements see: @@ -167,34 +148,76 @@ void SortPositionBatch(const common::Span> batch_info, }); } + +__forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } + +/*! \brief Count how many rows are assigned to left node. */ +__forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, + int16_t batch_idx) { + int mask = __activemask(); + int leader = __ffs(mask) - 1; + unsigned int prefix = __popc(mask & __lanemask_lt()); + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); + // If all threads here are working on the same node + // we can do a more efficient reduction with warp intrinsics + if (group_is_contiguous) { + unsigned ballot = __ballot_sync(mask, go_left); + uint32_t global_left_count = 0; + uint32_t global_right_count = 0; + if (prefix == 0) { + global_left_count = atomicAdd(&d_counts->first, __popc(ballot)); + global_right_count = atomicAdd(&d_counts->second, __popc(mask) - __popc(ballot)); + } + global_left_count = __shfl_sync(mask, global_left_count, leader); + global_right_count = __shfl_sync(mask, global_right_count, leader); + uint32_t local_left_count = __popc(ballot & __lanemask_lt()); + uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; + + if (go_left) { + return global_left_count + local_left_count; + } else { + return global_right_count + local_right_count; + } + + } else { + auto address = go_left ? 
&d_counts->first : &d_counts->second; + return atomicAdd(address, 1); + } +} + template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> batch_info, common::Span ridx, + const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, std::size_t total_rows) { + __shared__ KernelMemcpyArgs s_batch_info[32]; + for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { + s_batch_info[i] = d_batch_info.data()[i]; + } + const common::Span> batch_info(s_batch_info, d_batch_info.size()); + __syncthreads(); + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { int16_t batch_idx; std::size_t item_idx; OpDataT data; AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto segment = batch_info[batch_idx].segment; - auto op_res = op(ridx[item_idx], data); - if (op_res) { - auto num_left_items = atomicAdd(&counts.data()[batch_idx].first, 1); - ridx_tmp[segment.begin + num_left_items] = ridx[item_idx]; - } else { - auto num_right_items = atomicAdd(&counts.data()[batch_idx].second, 1); - ridx_tmp[segment.end - num_right_items - 1] = ridx[item_idx]; - } + auto ridx = d_ridx[item_idx]; + auto op_res = op(ridx, data); + auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); + auto destination_address = + op_res ? segment.begin + current_num_items : segment.end - current_num_items - 1; + ridx_tmp[destination_address] = ridx; } } template void SortPositionBatchUnstable(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + CHECK_LE(batch_info.size(), 32); constexpr int kBlockSize = 256; const int grid_size = std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 21628aace40b..3ff18a016f33 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -94,6 +94,7 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } +/* void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { thrust::device_vector group(group_in); thrust::device_vector increment(increment_in); @@ -115,6 +116,6 @@ void TestAtomicIncrement(const std::vector& group_in, const std::vector Date: Wed, 1 Jun 2022 02:54:05 -0700 Subject: [PATCH 43/64] Cleanup --- src/tree/gpu_hist/row_partitioner.cu | 9 +- src/tree/gpu_hist/row_partitioner.cuh | 124 ++---------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 23 ---- 3 files changed, 13 insertions(+), 143 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 7676c8e67495..cc117ae743e0 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -16,17 +16,12 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); - streams_.resize(2); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamCreate(&stream)); - } + 
dh::safe_cuda(cudaStreamCreate(&stream_)); } RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_)); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamDestroy(stream)); - } + dh::safe_cuda(cudaStreamDestroy(stream_)); } common::Span RowPartitioner::GetRows( diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 238a88a430e1..ca59fff337ab 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -51,117 +51,18 @@ __device__ void AssignBatch(const common::Span> batch_ } } -// We can scan over this tuple, where the scan gives us information on how to partition inputs -// according to the flag -struct IndexFlagTuple { - bst_uint idx; // The location of the item we are working on in ridx_ - bst_uint flag_scan; // This gets populated after scanning - bst_uint segment_start; // Start offset of this node segment - bst_uint segment_end; // End offset of this node segment - int16_t batch_idx; // Which node in the batch does this item belong to - bool flag; // Result of op (is this item going left?) -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - // Segmented scan - resets if we cross batch boundaries - if (a.batch_idx == b.batch_idx) { - // Accumulate the flags, everything else stays the same - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.segment_end,b.batch_idx, b.flag}; - } else { - return b; - } - } -}; - - -// This is a transformer output iterator -// It takes the result of the scan and performs the partition -// To understand how a scan is used to partition elements see: -// Harris, Mark, Shubhabrata Sengupta, and John D. Owens. "Parallel prefix sum (scan) with CUDA." -// GPU gems 3.39 (2007): 851-876. 
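// Device-side sketch (assumed names, 1-D blocks assumed) of the warp-aggregated counter
// used by AtomicIncrement: active lanes vote with a ballot, one leader lane issues a
// single atomicAdd for the whole warp, and each flagged lane derives its own slot from
// the ballot bits below it. The patch additionally falls back to a plain atomicAdd when
// the warp spans more than one node.
#include <cstddef>

__device__ unsigned WarpAggregatedAddSketch(unsigned* d_count, bool flag) {
  const unsigned mask = __activemask();        // lanes currently executing this call
  const int lane = threadIdx.x % 32;
  const int leader = __ffs(mask) - 1;          // lowest active lane performs the atomic
  const unsigned ballot = __ballot_sync(mask, flag);
  unsigned warp_base = 0;
  if (lane == leader) {
    warp_base = atomicAdd(d_count, static_cast<unsigned>(__popc(ballot)));
  }
  warp_base = __shfl_sync(mask, warp_base, leader);
  const unsigned lanemask_lt = (1u << lane) - 1u;
  // Only meaningful for lanes that passed flag == true
  return warp_base + __popc(ballot & lanemask_lt);
}

// Example use: compact the even values of an array into d_out with one atomic per warp.
__global__ void CompactEvensSketch(const unsigned* values, std::size_t n, unsigned* d_out,
                                   unsigned* d_count) {
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    const bool keep = (values[i] % 2) == 0;
    const unsigned slot = WarpAggregatedAddSketch(d_count, keep);
    if (keep) {
      d_out[slot] = values[i];
    }
  }
}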
-struct WriteResultsFunctor { - bst_uint* ridx_in; - bst_uint* ridx_out; - PartitionCountsT *counts; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - std::size_t scatter_address; - if (x.flag) { - bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan - scatter_address = x.segment_start + num_previous_flagged; - } else { - - bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; - scatter_address = x.segment_end - num_previous_unflagged - 1; - } - ridx_out[scatter_address] = ridx_in[x.idx]; - - if (x.idx == (x.segment_end - 1)) { - // Write out counts - counts[x.batch_idx] = {x.flag_scan,0}; - } - - // Discard - return {}; - } -}; - -template -void SortPositionBatch(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; - - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - auto op_res = op(ridx[item_idx], data); - return IndexFlagTuple{bst_uint(item_idx), - op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin), - bst_uint(batch_info.data()[batch_idx].segment.end), - batch_idx, - op_res}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows, stream); - - // copy active segments back to original buffer - dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - ridx[item_idx] = ridx_tmp[item_idx]; - }); -} - - __forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } -/*! \brief Count how many rows are assigned to left node. */ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, int16_t batch_idx) { int mask = __activemask(); int leader = __ffs(mask) - 1; - unsigned int prefix = __popc(mask & __lanemask_lt()); + uint32_t prefix = __popc(mask & __lanemask_lt()); bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); // If all threads here are working on the same node // we can do a more efficient reduction with warp intrinsics if (group_is_contiguous) { - unsigned ballot = __ballot_sync(mask, go_left); + uint32_t ballot = __ballot_sync(mask, go_left); uint32_t global_left_count = 0; uint32_t global_right_count = 0; if (prefix == 0) { @@ -173,11 +74,7 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, uint32_t local_left_count = __popc(ballot & __lanemask_lt()); uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; - if (go_left) { - return global_left_count + local_left_count; - } else { - return global_right_count + local_right_count; - } + return go_left ? 
global_left_count + local_left_count : global_right_count + local_right_count; } else { auto address = go_left ? &d_counts->first : &d_counts->second; @@ -185,7 +82,7 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, } } -template +template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, @@ -197,7 +94,8 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( const common::Span> batch_info(s_batch_info, d_batch_info.size()); __syncthreads(); - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; + idx += blockDim.x * gridDim.x) { int16_t batch_idx; std::size_t item_idx; OpDataT data; @@ -262,7 +160,7 @@ class RowPartitioner { dh::TemporaryArray ridx_tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; - std::vector streams_; + cudaStream_t stream_; public: RowPartitioner(int device_idx, size_t num_rows); @@ -305,7 +203,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(KernelMemcpyArgs), - cudaMemcpyDefault, streams_[1])); + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); @@ -314,13 +212,13 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows,op, - streams_[1]); + stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), - cudaMemcpyDefault, streams_[1])); + cudaMemcpyDefault, stream_)); - dh::safe_cuda(cudaStreamSynchronize(streams_[1])); + dh::safe_cuda(cudaStreamSynchronize(stream_)); // Update segments for (int i = 0; i < nidx.size(); i++) { diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 3ff18a016f33..d0e0f850a191 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -94,28 +94,5 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } -/* -void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { - thrust::device_vector group(group_in); - thrust::device_vector increment(increment_in); - thrust::device_vector reference(group_in.size()); - thrust::device_vector result(group_in.size()); - - auto d_group = group.data().get(); - auto d_increment = increment.data().get(); - auto d_reference = reference.data().get(); - auto d_result = result.data().get(); - dh::LaunchN(group.size(), [=] __device__(std::size_t idx) { - AtomicIncrement(d_result, d_increment[idx], d_group[idx]); - atomicAdd(d_reference + d_group[idx], d_increment[idx]); - }); - - EXPECT_EQ(reference, result); -} - -TEST(GpuHist, AtomicIncrement) { - TestAtomicIncrement({0, 0, 0}, {1, 0, 1}); - TestAtomicIncrement({0, 0, 1}, {1, 0, 1}); -}*/ } // namespace tree } // namespace xgboost From a764986612d46b0ebc82c346978bd184ece45811 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 1 Jun 2022 06:23:43 -0700 Subject: [PATCH 44/64] Use pointer for shared memory --- src/tree/gpu_hist/row_partitioner.cuh | 104 
+++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 11 +- 2 files changed, 70 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ca59fff337ab..f44fd8d0962e 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -28,26 +28,25 @@ struct Segment { using PartitionCountsT = thrust::pair; +// TODO(Rory): Can be larger. To be tuned alongside other batch operations. +static const int kMaxUpdatePositionBatchSize = 32; template -struct KernelMemcpyArgs { +struct PerNodeData { Segment segment; OpDataT data; }; template -__device__ void AssignBatch(const common::Span> batch_info, - std::size_t idx, int16_t& batch_idx, std::size_t& item_idx, OpDataT&data) { - const auto ptr = batch_info.data(); +__device__ __forceinline__ void AssignBatch(const PerNodeData *batch_info, + std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { std::size_t sum = 0; - - for (int16_t i = 0; i < batch_info.size(); i++) { - if (sum + ptr[i].segment.Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + ptr[i].segment.begin; - data = ptr[i].data; + for (int16_t i = 0; i < kMaxUpdatePositionBatchSize; i++) { + if (sum + batch_info[i].segment.Size() > global_thread_idx) { + *batch_idx = i; + *item_idx = (global_thread_idx - sum) + batch_info[i].segment.begin; break; } - sum += ptr[i].segment.Size(); + sum += batch_info[i].segment.Size(); } } @@ -82,40 +81,70 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, } } -template +template +struct SharedStorage { + PerNodeData data[kMaxUpdatePositionBatchSize]; + // Collectively load from global memory into shared memory + template + __device__ const PerNodeData* BlockLoad( + const common::Span> d_batch_info) { + for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { + data[i] = d_batch_info.data()[i]; + } + __syncthreads(); + return data; + } +}; + +template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> d_batch_info, common::Span d_ridx, + const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, std::size_t total_rows) { - __shared__ KernelMemcpyArgs s_batch_info[32]; - for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { - s_batch_info[i] = d_batch_info.data()[i]; - } - const common::Span> batch_info(s_batch_info, d_batch_info.size()); - __syncthreads(); + // Initialise shared memory this way to avoid calling constructors + __shared__ cub::Uninitialized> shared; + auto batch_info = shared.Alias().BlockLoad(d_batch_info); for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { - int16_t batch_idx; + int batch_idx; std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - auto segment = batch_info[batch_idx].segment; + AssignBatch(batch_info, idx,&batch_idx, &item_idx); auto ridx = d_ridx[item_idx]; - auto op_res = op(ridx, data); + auto op_res = op(ridx, batch_info[batch_idx].data); auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); + auto segment = batch_info[batch_idx].segment; auto destination_address = op_res ? 
segment.begin + current_num_items : segment.end - current_num_items - 1; ridx_tmp[destination_address] = ridx; } } +template +__global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( + const common::Span> d_batch_info, common::Span d_ridx, + common::Span ridx_tmp, + std::size_t total_rows) { + + __shared__ cub::Uninitialized> shared; + auto batch_info = shared.Alias().BlockLoad(d_batch_info); + + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; + idx += blockDim.x * gridDim.x) { + int batch_idx; + std::size_t item_idx; + AssignBatch(batch_info, idx,&batch_idx, &item_idx); + d_ridx[item_idx] = ridx_tmp[item_idx]; + } +} + template -void SortPositionBatchUnstable(const common::Span> batch_info, +void SortPositionBatchUnstable(const common::Span> batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, cudaStream_t stream) { - CHECK_LE(batch_info.size(), 32); + CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); @@ -123,14 +152,8 @@ void SortPositionBatchUnstable(const common::Span> bat SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); - // copy active segments back to original buffer - dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - ridx[item_idx] = ridx_tmp[item_idx]; - }); + SortPositionCopyKernel + <<>>(batch_info, ridx, ridx_tmp, total_rows); } /** \brief Class responsible for tracking subsets of rows as we add splits and @@ -193,8 +216,8 @@ class RowPartitioner { CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - auto h_batch_info = pinned2_.GetSpan>(nidx.size()); - dh::TemporaryArray> d_batch_info(nidx.size()); + auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { @@ -202,7 +225,7 @@ class RowPartitioner { total_rows += ridx_segments_.at(nidx.at(i)).Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), - h_batch_info.size() * sizeof(KernelMemcpyArgs), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, stream_)); // Temporary arrays @@ -210,9 +233,10 @@ class RowPartitioner { dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), - dh::ToSpan(d_counts), total_rows,op, - stream_); + SortPositionBatchUnstable(common::Span>( + d_batch_info.data().get(), d_batch_info.size()), + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), + total_rows, op, stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index d0e0f850a191..92bccff35330 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -59,8 +59,8 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); - std::vector> h_batch_info(segments.size()); - dh::TemporaryArray> 
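// Sketch (assumed names) of the collective shared-memory load the kernels above perform
// on the small per-node array: each thread copies a strided subset, then the block
// synchronises before anyone reads the cache. The patch wraps its struct in
// cub::Uninitialized because __shared__ variables may not run non-trivial constructors;
// this sketch sidesteps that by assuming a trivially constructible T.
template <typename T, int kBlockSize, int kMaxItems>
struct BlockCacheSketch {
  T data[kMaxItems];

  __device__ const T* Load(const T* d_items, int n_items) {
    for (int i = threadIdx.x; i < n_items; i += kBlockSize) {
      data[i] = d_items[i];  // one strided pass over the small global array
    }
    __syncthreads();         // every thread sees the cached copy after this point
    return data;
  }
};

// Usage inside a kernel body (T must be trivially constructible to live in __shared__):
//   __shared__ BlockCacheSketch<int2, 256, 32> cache;
//   const int2* batch = cache.Load(d_batch_info, n_nodes);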
d_batch_info(segments.size()); + std::vector> h_batch_info(segments.size()); + dh::TemporaryArray> d_batch_info(segments.size()); std::size_t total_rows = 0; for (int i = 0; i < segments.size(); i++) { @@ -68,10 +68,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, nullptr)); - SortPositionBatchUnstable(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, nullptr); + SortPositionBatchUnstable( + common::Span>(d_batch_info.data().get(), d_batch_info.size()), + dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 001c2f267ef8e7bd722e0ee66a93da4a6d823e12 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:08:44 -0700 Subject: [PATCH 45/64] Row partitioner grid --- src/tree/gpu_hist/row_partitioner.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index f44fd8d0962e..70920ffcc3b3 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -110,7 +110,7 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( idx += blockDim.x * gridDim.x) { int batch_idx; std::size_t item_idx; - AssignBatch(batch_info, idx,&batch_idx, &item_idx); + AssignBatch(batch_info, idx, &batch_idx, &item_idx); auto ridx = d_ridx[item_idx]; auto op_res = op(ridx, batch_info[batch_idx].data); auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); @@ -147,7 +147,7 @@ void SortPositionBatchUnstable(const common::Span> ba CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = - std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + std::min(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From 70bad86552e6fe40b0c6434c9623426fb003e13e Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:09:57 -0700 Subject: [PATCH 46/64] Custom FinalizePositionKernel --- src/tree/gpu_hist/finalize_position.cuh | 113 ++++++++++++++++++++++++ src/tree/updater_gpu_hist.cu | 62 ++----------- 2 files changed, 118 insertions(+), 57 deletions(-) create mode 100644 src/tree/gpu_hist/finalize_position.cuh diff --git a/src/tree/gpu_hist/finalize_position.cuh b/src/tree/gpu_hist/finalize_position.cuh new file mode 100644 index 000000000000..0f5ec36f649f --- /dev/null +++ b/src/tree/gpu_hist/finalize_position.cuh @@ -0,0 +1,113 @@ +/*! 
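// Sketch (assumed names) of the capped grid-stride launch these kernels use: the grid is
// at most 256 blocks (hence std::min rather than std::max above) and the in-kernel loop
// strides over any remaining elements, so small inputs do not launch more blocks than
// they need.
#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

constexpr int kBlockSizeSketch = 256;

__global__ void ScaleKernelSketch(float* data, std::size_t n, float factor) {
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    data[i] *= factor;  // stand-in for the real per-row work
  }
}

void LaunchScaleSketch(float* data, std::size_t n, float factor, cudaStream_t stream) {
  if (n == 0) return;
  const int grid = static_cast<int>(
      std::min<std::size_t>(256, (n + kBlockSizeSketch - 1) / kBlockSizeSketch));
  ScaleKernelSketch<<<grid, kBlockSizeSketch, 0, stream>>>(data, n, factor);
}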
+ * Copyright 2017-2022 XGBoost contributors + */ +#pragma once +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/span.h" +#include "../../data/ellpack_page.cuh" + +namespace xgboost { +namespace tree { + +template +__device__ const RegTree::Node *LoadTree(common::Span d_nodes, int *smem) { + if (!kUseShared) { + return d_nodes.data(); + } + + auto nodes = reinterpret_cast(smem); + for (int i = threadIdx.x; i < d_nodes.size(); i += kBlockSize) { + nodes[i]=d_nodes[i]; + } + __syncthreads(); + return nodes; +} + +template +__global__ __launch_bounds__(kBlockSize) void FinalizePositionKernel( + common::Span d_nodes, common::Span feature_types, + common::Span categories, + common::Span categories_segments, + common::Span gradients, const EllpackDeviceAccessor dmatrix, + common::Span predictions, common::Span position) { + extern __shared__ int s[]; + auto nodes = LoadTree(d_nodes, s); + auto new_position_op = [&] __device__(size_t row_id) { + // What happens if user prune the tree? + if (!dmatrix.IsInRange(row_id)) { + return -1; + } + int row_position = RegTree::kRoot; + auto node = nodes[row_position]; + + while (!node.IsLeaf()) { + bst_float element = dmatrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + row_position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(feature_types, row_position)) { + auto node_cats = categories.subspan(categories_segments[row_position].beg, + categories_segments[row_position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); + } + if (go_left) { + row_position = node.LeftChild(); + } else { + row_position = node.RightChild(); + } + } + node = nodes[row_position]; + } + + return row_position; + }; // NOLINT + + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < position.size(); + idx += blockDim.x * gridDim.x) { + bst_node_t row_position = new_position_op(idx); + predictions[idx] = nodes[row_position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = gradients[idx].GetHess() - .0f == 0.f; + position[idx] = is_sampled ? ~row_position : row_position; + } +} + +inline void CallFinalizePosition(common::Span nodes, + common::Span feature_types, + common::Span categories, + common::Span categories_segments, + common::Span gradients, + const EllpackDeviceAccessor dmatrix, + common::Span predictions, + common::Span position){ + + // Use shared memory? + int device = 0; + dh::safe_cuda(cudaGetDevice(&device)); + int max_shared_memory = dh::MaxSharedMemoryOptin(device); + size_t smem_size = sizeof( RegTree::Node) * + nodes.size(); + bool shared = smem_size <= max_shared_memory; + smem_size = shared ? 
smem_size : 0; + constexpr int kBlockSize = 256; + const int grid_size = + std::min(256, static_cast(xgboost::common::DivRoundUp(position.size(), kBlockSize))); + + if (shared) { + FinalizePositionKernel + <<>>(nodes, feature_types, categories, categories_segments, + gradients, dmatrix, predictions, position); + } else { + FinalizePositionKernel + <<>>(nodes, feature_types, categories, categories_segments, + gradients, dmatrix, predictions, position); + } +} +}; // namespace tree +}; // namespace xgboost \ No newline at end of file diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1c38ba51d7..a6a186b61bbd 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,6 +38,7 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/evaluate_splits.cuh" #include "gpu_hist/expand_entry.cuh" +#include "gpu_hist/finalize_position.cuh" #include "xgboost/task.h" #include "xgboost/tree_model.h" @@ -436,66 +437,13 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } - - void FinalisePositionInPage(EllpackPageImpl const *page, - const common::Span d_nodes, - common::Span d_feature_types, - common::Span categories, - common::Span categories_segments, - ObjInfo task, - HostDeviceVector* p_out_position) { - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto d_gpair = this->gpair; - auto new_position_op = [=] __device__(size_t row_id) { - // What happens if user prune the tree? - if (!d_matrix.IsInRange(row_id)) { - return -1; - } - int position = RegTree::kRoot; - auto node = d_nodes[position]; - - while (!node.IsLeaf()) { - bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(d_feature_types, position)) { - auto node_cats = categories.subspan(categories_segments[position].beg, - categories_segments[position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - position = node.LeftChild(); - } else { - position = node.RightChild(); - } - } - node = d_nodes[position]; - } - - return position; - }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); - auto d_update_predictions = dh::ToSpan(update_predictions); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { - bst_node_t position = new_position_op(idx); - d_update_predictions[idx] = d_nodes[position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled ? 
~position : position; - }); + CallFinalizePosition(dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), dh::ToSpan(d_categories), + dh::ToSpan(d_categories_segments), this->gpair, + page->GetDeviceAccessor(ctx_->gpu_id), dh::ToSpan(update_predictions), + p_out_position->DeviceSpan()); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From 31e02f0614145b020274617bc3917f26aabad13a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:12:08 -0700 Subject: [PATCH 47/64] Revert "Custom FinalizePositionKernel" This reverts commit 70bad86552e6fe40b0c6434c9623426fb003e13e. --- src/tree/gpu_hist/finalize_position.cuh | 113 ------------------------ src/tree/updater_gpu_hist.cu | 62 +++++++++++-- 2 files changed, 57 insertions(+), 118 deletions(-) delete mode 100644 src/tree/gpu_hist/finalize_position.cuh diff --git a/src/tree/gpu_hist/finalize_position.cuh b/src/tree/gpu_hist/finalize_position.cuh deleted file mode 100644 index 0f5ec36f649f..000000000000 --- a/src/tree/gpu_hist/finalize_position.cuh +++ /dev/null @@ -1,113 +0,0 @@ -/*! - * Copyright 2017-2022 XGBoost contributors - */ -#pragma once -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/span.h" -#include "../../data/ellpack_page.cuh" - -namespace xgboost { -namespace tree { - -template -__device__ const RegTree::Node *LoadTree(common::Span d_nodes, int *smem) { - if (!kUseShared) { - return d_nodes.data(); - } - - auto nodes = reinterpret_cast(smem); - for (int i = threadIdx.x; i < d_nodes.size(); i += kBlockSize) { - nodes[i]=d_nodes[i]; - } - __syncthreads(); - return nodes; -} - -template -__global__ __launch_bounds__(kBlockSize) void FinalizePositionKernel( - common::Span d_nodes, common::Span feature_types, - common::Span categories, - common::Span categories_segments, - common::Span gradients, const EllpackDeviceAccessor dmatrix, - common::Span predictions, common::Span position) { - extern __shared__ int s[]; - auto nodes = LoadTree(d_nodes, s); - auto new_position_op = [&] __device__(size_t row_id) { - // What happens if user prune the tree? - if (!dmatrix.IsInRange(row_id)) { - return -1; - } - int row_position = RegTree::kRoot; - auto node = nodes[row_position]; - - while (!node.IsLeaf()) { - bst_float element = dmatrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - row_position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(feature_types, row_position)) { - auto node_cats = categories.subspan(categories_segments[row_position].beg, - categories_segments[row_position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - row_position = node.LeftChild(); - } else { - row_position = node.RightChild(); - } - } - node = nodes[row_position]; - } - - return row_position; - }; // NOLINT - - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < position.size(); - idx += blockDim.x * gridDim.x) { - bst_node_t row_position = new_position_op(idx); - predictions[idx] = nodes[row_position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = gradients[idx].GetHess() - .0f == 0.f; - position[idx] = is_sampled ? 
~row_position : row_position; - } -} - -inline void CallFinalizePosition(common::Span nodes, - common::Span feature_types, - common::Span categories, - common::Span categories_segments, - common::Span gradients, - const EllpackDeviceAccessor dmatrix, - common::Span predictions, - common::Span position){ - - // Use shared memory? - int device = 0; - dh::safe_cuda(cudaGetDevice(&device)); - int max_shared_memory = dh::MaxSharedMemoryOptin(device); - size_t smem_size = sizeof( RegTree::Node) * - nodes.size(); - bool shared = smem_size <= max_shared_memory; - smem_size = shared ? smem_size : 0; - constexpr int kBlockSize = 256; - const int grid_size = - std::min(256, static_cast(xgboost::common::DivRoundUp(position.size(), kBlockSize))); - - if (shared) { - FinalizePositionKernel - <<>>(nodes, feature_types, categories, categories_segments, - gradients, dmatrix, predictions, position); - } else { - FinalizePositionKernel - <<>>(nodes, feature_types, categories, categories_segments, - gradients, dmatrix, predictions, position); - } -} -}; // namespace tree -}; // namespace xgboost \ No newline at end of file diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a6a186b61bbd..3d1c38ba51d7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,7 +38,6 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/evaluate_splits.cuh" #include "gpu_hist/expand_entry.cuh" -#include "gpu_hist/finalize_position.cuh" #include "xgboost/task.h" #include "xgboost/tree_model.h" @@ -437,13 +436,66 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); + } + + void FinalisePositionInPage(EllpackPageImpl const *page, + const common::Span d_nodes, + common::Span d_feature_types, + common::Span categories, + common::Span categories_segments, + ObjInfo task, + HostDeviceVector* p_out_position) { + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_gpair = this->gpair; + auto new_position_op = [=] __device__(size_t row_id) { + // What happens if user prune the tree? 
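// Clarifying comment (added; not part of the original patch): IsInRange() appears to test
// whether row_id lies inside this ELLPACK page's row window, so rows outside the page are
// given position -1 below instead of being walked down the tree.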
+ if (!d_matrix.IsInRange(row_id)) { + return -1; + } + int position = RegTree::kRoot; + auto node = d_nodes[position]; + + while (!node.IsLeaf()) { + bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(d_feature_types, position)) { + auto node_cats = categories.subspan(categories_segments[position].beg, + categories_segments[position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); + } + if (go_left) { + position = node.LeftChild(); + } else { + position = node.RightChild(); + } + } + node = d_nodes[position]; + } + + return position; + }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); - CallFinalizePosition(dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), dh::ToSpan(d_categories), - dh::ToSpan(d_categories_segments), this->gpair, - page->GetDeviceAccessor(ctx_->gpu_id), dh::ToSpan(update_predictions), - p_out_position->DeviceSpan()); + auto d_update_predictions = dh::ToSpan(update_predictions); + auto sorted_position = p_out_position->DevicePointer(); + dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { + bst_node_t position = new_position_op(idx); + d_update_predictions[idx] = d_nodes[position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + sorted_position[idx] = is_sampled ? ~position : position; + }); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From b86cb2930761651ed6769b20da18c92efb1cf1ff Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:41:32 -0700 Subject: [PATCH 48/64] Reduce grid size --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 70920ffcc3b3..a8c1b09df161 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -147,7 +147,7 @@ void SortPositionBatchUnstable(const common::Span> ba CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = - std::min(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + std::min(128, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From c3944af9edc3a0a48bf429915784dd54a5c266ba Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 4 Jun 2022 12:24:30 -0700 Subject: [PATCH 49/64] Tune items/thread --- src/tree/gpu_hist/row_partitioner.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a8c1b09df161..ad664d5cdab5 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -146,8 +146,10 @@ void SortPositionBatchUnstable(const common::Span> ba OpT op, cudaStream_t stream) { CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; - const int grid_size = - std::min(128, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + + // Value found by experimentation + const int 
kItemsThread = 12; + const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From cdd134ac1bca6a2ba21ba7bca5b8335422f27f09 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 6 Jun 2022 04:59:15 -0700 Subject: [PATCH 50/64] FinalisePosition custom kernel --- src/tree/gpu_hist/row_partitioner.cu | 4 +- src/tree/gpu_hist/row_partitioner.cuh | 112 ++++++++++++++++++++++++-- src/tree/updater_gpu_hist.cu | 33 ++++---- 3 files changed, 125 insertions(+), 24 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index cc117ae743e0..53b6039dabd3 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,7 @@ namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); - ridx_segments_.emplace_back(Segment(0, num_rows)); + ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); dh::safe_cuda(cudaStreamCreate(&stream_)); } @@ -26,7 +26,7 @@ RowPartitioner::~RowPartitioner() { common::Span RowPartitioner::GetRows( bst_node_t nidx) { - auto segment = ridx_segments_.at(nidx); + auto segment = ridx_segments_.at(nidx).segment; // Return empty span here as a valid result // Will error if we try to construct a span from a pointer with size 0 if (segment.Size() == 0) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ad664d5cdab5..6461f0f98cd6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -158,12 +158,54 @@ void SortPositionBatchUnstable(const common::Span> ba <<>>(batch_info, ridx, ridx_tmp, total_rows); } +struct NodePositionInfo { + Segment segment; + int left_child = -1; + int right_child = -1; + __device__ bool IsLeaf() { return left_child == -1; } +}; + +__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const NodePositionInfo* d_node_info) { + int position = 0; + NodePositionInfo node = d_node_info[position]; + while (!node.IsLeaf()) { + NodePositionInfo left = d_node_info[node.left_child]; + NodePositionInfo right = d_node_info[node.right_child]; + if (idx >= left.segment.begin && idx < left.segment.end) { + position = node.left_child; + node = left; + } else if (idx >= right.segment.begin && idx < right.segment.end) { + position = node.right_child; + node = right; + } else { + KERNEL_CHECK(false); + } + } + return position; +} + +template +__global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( + const common::Span d_node_info, + const common::Span d_ridx, common::Span d_out_position, OpT op, + IsSampledOpT is_sampled) { + bst_node_t* out_ptr = d_out_position.data(); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); + idx += blockDim.x * gridDim.x) { + auto position = GetPositionFromSegments(idx, d_node_info.data()); + RowIndexT ridx = d_ridx.data()[idx]; + bst_node_t new_position = op(ridx, position); + out_ptr[ridx] = is_sampled(ridx) ? ~new_position : new_position; + } +} + /** \brief Class responsible for tracking subsets of rows as we add splits and * partition training rows into different leaf nodes. 
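A side note on the launch tuning above (kItemsThread, grid_size): the kernels here use grid-stride loops, so the grid only needs roughly total_rows / (block size * items per thread) blocks, and each thread then processes several rows. A minimal, self-contained sketch of the pattern, assuming only the CUDA runtime (ScaleKernel, LaunchScale and kItemsPerThread are illustrative names, not from the patch):

#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

namespace sketch {
constexpr int kBlockSize = 256;
constexpr int kItemsPerThread = 12;  // assumed here; the patch settles on values of 8-12 by experiment

__global__ void ScaleKernel(const float* in, float* out, std::size_t n) {
  // Grid-stride loop: each thread visits every (blockDim.x * gridDim.x)-th element.
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = 2.0f * in[i];
  }
}

inline void LaunchScale(const float* in, float* out, std::size_t n, cudaStream_t stream) {
  // Round up so that grid_size * kBlockSize * kItemsPerThread covers all n elements.
  const int grid_size = std::max<int>(
      1, static_cast<int>((n + kBlockSize * kItemsPerThread - 1) / (kBlockSize * kItemsPerThread)));
  ScaleKernel<<<grid_size, kBlockSize, 0, stream>>>(in, out, n);
}
}  // namespace sketch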
*/ class RowPartitioner { public: using RowIndexT = bst_uint; + private: int device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to @@ -174,7 +216,8 @@ class RowPartitioner { * node id -> segment -> indices of rows belonging to node */ /*! \brief Range of row index for each node, pointers into ridx below. */ - std::vector ridx_segments_; + + std::vector ridx_segments_; /*! \brief mapping for node id -> rows. * This looks like: * node id | 1 | 2 | @@ -223,8 +266,8 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)), op_data.at(i)}; - total_rows += ridx_segments_.at(nidx.at(i)).Size(); + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; + total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), @@ -248,16 +291,73 @@ class RowPartitioner { // Update segments for (int i = 0; i < nidx.size(); i++) { - auto segment = ridx_segments_.at(nidx[i]); + auto segment = ridx_segments_.at(nidx[i]).segment; auto left_count = h_counts[i].first; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), std::max(left_nidx[i], right_nidx[i]) + 1)); - ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + ridx_segments_[nidx[i]] = NodePositionInfo{segment, left_nidx[i], right_nidx[i]}; + ridx_segments_[left_nidx[i]] = + NodePositionInfo{Segment(segment.begin, segment.begin + left_count)}; + ridx_segments_[right_nidx[i]] = + NodePositionInfo{Segment(segment.begin + left_count, segment.end)}; } } + + /** + * \brief Finalise the position of all training instances after tree construction is + * complete. Does not update any other meta information in this data structure, so + * should only be used at the end of training. + * + * When the task requires update leaf, this function will copy the node index into + * p_out_position. The index is negated if it's being sampled in current iteration. + * + * \param p_out_position Node index for each row. + * \param op Device lambda. Should provide the row index and current position as an + * argument and return the new position for this training instance. + * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
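As a reading aid for the new finalisation path: a row's final node can be recovered purely from the per-node segments recorded while splitting, by descending from the root and picking whichever child's segment contains the row's slot, which is the walk GetPositionFromSegments performs on the device. A host-side sketch with illustrative names (NodeInfoSketch, PositionFromSegments), not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

struct NodeInfoSketch {
  std::size_t begin{0}, end{0};  // slots in the row-index buffer owned by this node
  int left_child{-1}, right_child{-1};
  bool IsLeaf() const { return left_child == -1; }
};

inline int PositionFromSegments(std::size_t slot, const std::vector<NodeInfoSketch>& nodes) {
  int position = 0;  // start at the root
  NodeInfoSketch node = nodes[position];
  while (!node.IsLeaf()) {
    const NodeInfoSketch& left = nodes[node.left_child];
    const NodeInfoSketch& right = nodes[node.right_child];
    if (slot >= left.begin && slot < left.end) {
      position = node.left_child;
      node = left;
    } else if (slot >= right.begin && slot < right.end) {
      position = node.right_child;
      node = right;
    } else {
      assert(false && "slot not covered by either child's segment");
      break;
    }
  }
  return position;
}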
+ */ + template + void FinalisePosition( + common::Span d_out_position, FinalisePositionOpT op, + Sampledp sampledp) { + dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); + dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), + sizeof(NodePositionInfo) * ridx_segments_.size(), + cudaMemcpyDefault, stream_)); + + auto d_node_info = d_node_info_storage.data().get(); + + auto current_position = [=] __device__(std::size_t idx) { + int position = 0; + NodePositionInfo node = d_node_info[position]; + while (!node.IsLeaf()) { + NodePositionInfo left = d_node_info[node.left_child]; + NodePositionInfo right = d_node_info[node.right_child]; + if (idx >= left.segment.begin && idx < left.segment.end) { + position = node.left_child; + node = left; + } else if (idx >= right.segment.begin && idx < right.segment.end) { + position = node.right_child; + node = right; + } else { + KERNEL_CHECK(false); + } + } + return position; + }; + + constexpr int kBlockSize = 256; + + // Value found by experimentation + const int kItemsThread = 12; + const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); + common::Span d_ridx(ridx_.data().get(), ridx_.size()); + FinalisePositionKernel<<>>( + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op, sampledp); + } }; + }; // namespace tree }; // namespace xgboost diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1c38ba51d7..8dc8ff97b120 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -450,12 +450,16 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; - auto new_position_op = [=] __device__(size_t row_id) { + update_predictions.resize(row_partitioner->GetRows().size()); + auto d_update_predictions = dh::ToSpan(update_predictions); + p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->Resize(row_partitioner->GetRows().size()); + + auto new_position_op = [=] __device__(size_t row_id, int position) { // What happens if user prune the tree? if (!d_matrix.IsInRange(row_id)) { return -1; } - int position = RegTree::kRoot; auto node = d_nodes[position]; while (!node.IsLeaf()) { @@ -478,24 +482,21 @@ struct GPUHistMakerDevice { position = node.RightChild(); } } + node = d_nodes[position]; } + d_update_predictions[row_id] = node.LeafValue(); return position; - }; // NOLINT - p_out_position->SetDevice(ctx_->gpu_id); - p_out_position->Resize(page->n_rows); - update_predictions.resize(page->n_rows); - auto d_update_predictions = dh::ToSpan(update_predictions); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { - bst_node_t position = new_position_op(idx); - d_update_predictions[idx] = d_nodes[position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled ? ~position : position; - }); + }; // NOLINT + + auto is_sampled_op = [d_gpair] __device__(size_t ridx) { + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. 
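// Added note (not in the original patch): with gradient-based sampling, the gradient pair of a
// row dropped for this iteration is zeroed out, so a Hessian of exactly zero is the marker used
// here; such rows later receive a bit-negated node index so downstream consumers can tell them
// apart from rows that were kept.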
+ return d_gpair[ridx].GetHess() - .0f == 0.f; + }; + + row_partitioner->FinalisePosition(p_out_position->DeviceSpan(), new_position_op, is_sampled_op); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From edabc455063351761992d51d34178e642ae257a6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 7 Jun 2022 13:31:56 -0700 Subject: [PATCH 51/64] Fixing slow scatter --- src/tree/gpu_hist/row_partitioner.cuh | 26 +++++++++---------- src/tree/updater_gpu_hist.cu | 15 ++++++----- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 ++- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 6461f0f98cd6..e5a4a1849ab2 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -152,7 +152,7 @@ void SortPositionBatchUnstable(const common::Span> ba const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionBatchUnstableKernel - <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); + <<>>(batch_info, ridx, ridx_tmp,d_counts, op, total_rows); SortPositionCopyKernel <<>>(batch_info, ridx, ridx_tmp, total_rows); @@ -184,18 +184,16 @@ __device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const No return position; } -template +template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, - const common::Span d_ridx, common::Span d_out_position, OpT op, - IsSampledOpT is_sampled) { - bst_node_t* out_ptr = d_out_position.data(); + const common::Span d_ridx,common::Span d_out_position, OpT op) { for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); idx += blockDim.x * gridDim.x) { auto position = GetPositionFromSegments(idx, d_node_info.data()); - RowIndexT ridx = d_ridx.data()[idx]; + RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); - out_ptr[ridx] = is_sampled(ridx) ? ~new_position : new_position; + d_out_position[ridx] = new_position; } } @@ -266,7 +264,8 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, + op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), @@ -280,7 +279,7 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatchUnstable(common::Span>( d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), total_rows, op, stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), @@ -318,10 +317,9 @@ class RowPartitioner { * argument and return the new position for this training instance. * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
*/ - template + template void FinalisePosition( - common::Span d_out_position, FinalisePositionOpT op, - Sampledp sampledp) { + common::Span d_out_position, FinalisePositionOpT op) { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), @@ -351,11 +349,11 @@ class RowPartitioner { constexpr int kBlockSize = 256; // Value found by experimentation - const int kItemsThread = 12; + const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); FinalisePositionKernel<<>>( - dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op, sampledp); + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 8dc8ff97b120..b0c9fbaab029 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -401,6 +401,7 @@ struct GPUHistMakerDevice { } return go_left; }); + } // After tree update is finished, update the position of all training @@ -490,13 +491,15 @@ struct GPUHistMakerDevice { return position; }; // NOLINT - auto is_sampled_op = [d_gpair] __device__(size_t ridx) { - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - return d_gpair[ridx].GetHess() - .0f == 0.f; - }; + auto d_out_position = p_out_position->DeviceSpan(); + row_partitioner->FinalisePosition(d_out_position, new_position_op); - row_partitioner->FinalisePosition(p_out_position->DeviceSpan(), new_position_op, is_sampled_op); + dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) { + bst_node_t position = d_out_position[idx]; + d_update_predictions[idx] = d_nodes[position].LeafValue(); + bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + d_out_position[idx] = is_row_sampled ? ~position : position; + }); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 92bccff35330..ccde627bd9ea 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -72,7 +72,8 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector>(d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, nullptr); + dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, + op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 43eb83e6c9cfe214d67ce8f22f6d2b7f14c65e3c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Jun 2022 02:38:08 -0700 Subject: [PATCH 52/64] Remove unstable --- src/tree/gpu_hist/row_partitioner.cuh | 194 ++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 12 +- 2 files changed, 111 insertions(+), 95 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e5a4a1849ab2..ee9884a3bf36 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -17,8 +17,8 @@ namespace tree { /** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. 
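A reading aid for the batched kernels this patch reworks: a flat thread index over all active nodes is translated into a (node, slot) pair by walking the per-node segment sizes, which is what the device-side AssignBatch below does. A host-side sketch under the same assumptions (SegmentSketch and AssignBatchHost are illustrative names, not part of the patch):

#include <cstddef>
#include <cstdint>
#include <vector>

struct SegmentSketch {
  uint32_t begin{0};
  uint32_t end{0};
  uint32_t Size() const { return end - begin; }
};

// Map a flat index over the concatenated segments to (segment index, slot in the row buffer).
inline bool AssignBatchHost(const std::vector<SegmentSketch>& segments, std::size_t flat_idx,
                            int* batch_idx, std::size_t* item_idx) {
  std::size_t sum = 0;
  for (std::size_t i = 0; i < segments.size(); ++i) {
    if (sum + segments[i].Size() > flat_idx) {
      *batch_idx = static_cast<int>(i);
      *item_idx = (flat_idx - sum) + segments[i].begin;
      return true;
    }
    sum += segments[i].Size();
  }
  return false;  // flat_idx is past the total number of rows in this batch
}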
*/ struct Segment { - size_t begin{0}; - size_t end{0}; + uint32_t begin{0}; + uint32_t end{0}; Segment() = default; @@ -36,11 +36,13 @@ struct PerNodeData { OpDataT data; }; -template -__device__ __forceinline__ void AssignBatch(const PerNodeData *batch_info, +__constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; + +template +__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { - std::size_t sum = 0; - for (int16_t i = 0; i < kMaxUpdatePositionBatchSize; i++) { + uint32_t sum = 0; + for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { *batch_idx = i; *item_idx = (global_thread_idx - sum) + batch_info[i].segment.begin; @@ -50,36 +52,6 @@ __device__ __forceinline__ void AssignBatch(const PerNodeData *batch_in } } -__forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } - -__forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, - int16_t batch_idx) { - int mask = __activemask(); - int leader = __ffs(mask) - 1; - uint32_t prefix = __popc(mask & __lanemask_lt()); - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); - // If all threads here are working on the same node - // we can do a more efficient reduction with warp intrinsics - if (group_is_contiguous) { - uint32_t ballot = __ballot_sync(mask, go_left); - uint32_t global_left_count = 0; - uint32_t global_right_count = 0; - if (prefix == 0) { - global_left_count = atomicAdd(&d_counts->first, __popc(ballot)); - global_right_count = atomicAdd(&d_counts->second, __popc(mask) - __popc(ballot)); - } - global_left_count = __shfl_sync(mask, global_left_count, leader); - global_right_count = __shfl_sync(mask, global_right_count, leader); - uint32_t local_left_count = __popc(ballot & __lanemask_lt()); - uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; - - return go_left ? global_left_count + local_left_count : global_right_count + local_right_count; - - } else { - auto address = go_left ? 
&d_counts->first : &d_counts->second; - return atomicAdd(address, 1); - } -} template struct SharedStorage { @@ -87,75 +59,122 @@ struct SharedStorage { // Collectively load from global memory into shared memory template __device__ const PerNodeData* BlockLoad( - const common::Span> d_batch_info) { - for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { - data[i] = d_batch_info.data()[i]; + const PerNodeData* d_batch_info) { + for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { + data[i] = d_batch_info[i]; } __syncthreads(); return data; } }; -template -__global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> d_batch_info, common::Span d_ridx, - common::Span ridx_tmp, common::Span counts, OpT op, - std::size_t total_rows) { - // Initialise shared memory this way to avoid calling constructors - __shared__ cub::Uninitialized> shared; - auto batch_info = shared.Alias().BlockLoad(d_batch_info); - - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; - idx += blockDim.x * gridDim.x) { - int batch_idx; - std::size_t item_idx; - AssignBatch(batch_info, idx, &batch_idx, &item_idx); - auto ridx = d_ridx[item_idx]; - auto op_res = op(ridx, batch_info[batch_idx].data); - auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); - auto segment = batch_info[batch_idx].segment; - auto destination_address = - op_res ? segment.begin + current_num_items : segment.end - current_num_items - 1; - ridx_tmp[destination_address] = ridx; - } -} - template __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( - const common::Span> d_batch_info, common::Span d_ridx, - common::Span ridx_tmp, + common::Span d_ridx, const common::Span ridx_tmp, std::size_t total_rows) { - + // Load this into shared memory + // the compiler puts it into registers otherwise + // then we get spilling to local memory + const PerNodeData* batch_info = + reinterpret_cast*>(constant_memory); __shared__ cub::Uninitialized> shared; - auto batch_info = shared.Alias().BlockLoad(d_batch_info); - + auto s_batch_info = shared.Alias().BlockLoad(batch_info); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { int batch_idx; std::size_t item_idx; - AssignBatch(batch_info, idx,&batch_idx, &item_idx); + AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); d_ridx[item_idx] = ridx_tmp[item_idx]; } } +// We can scan over this tuple, where the scan gives us information on how to partition inputs +// according to the flag +struct IndexFlagTuple { + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning + int batch_idx; // Which node in the batch does this item belong to + bool flag; // Result of op (is this item going left?) 
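// Annotation (added, not part of the patch): together with the IndexFlagOp defined next,
// this tuple lets a single inclusive scan produce a per-node (segmented) running count of
// rows that go left; the scan restarts whenever batch_idx changes, and the value carried by
// the last element of a segment is that node's left count, which WriteResultsFunctor stores.
// A host-side model of the combine step, with illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

struct TupleSketch {
  uint32_t idx;        // slot in the row-index buffer
  uint32_t flag_scan;  // inclusive count of flagged items within the segment so far
  int batch_idx;       // which node/segment the item belongs to
  bool flag;           // does this row go left?
};

inline TupleSketch CombineSketch(const TupleSketch& a, const TupleSketch& b) {
  if (a.batch_idx == b.batch_idx) {
    return {b.idx, a.flag_scan + b.flag_scan, b.batch_idx, b.flag};
  }
  return b;  // crossing into a new segment restarts the running count
}

// Sequential stand-in for the device-wide inclusive scan over the tuples.
inline void InclusiveScanSketch(std::vector<TupleSketch>* items) {
  for (std::size_t i = 1; i < items->size(); ++i) {
    (*items)[i] = CombineSketch((*items)[i - 1], (*items)[i]);
  }
}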
+}; + +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + // Segmented scan - resets if we cross batch boundaries + if (a.batch_idx == b.batch_idx) { + // Accumulate the flags, everything else stays the same + return {b.idx, a.flag_scan + b.flag_scan, b.batch_idx, b.flag}; + } else { + return b; + } + } +}; + +template +struct WriteResultsFunctor { + const bst_uint* ridx_in; + bst_uint* ridx_out; + PartitionCountsT *counts; + + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + std::size_t scatter_address; + const PerNodeData* batch_info = + reinterpret_cast*>(constant_memory); + const Segment& segment = batch_info[x.batch_idx].segment; + if (x.flag) { + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + scatter_address = segment.begin + num_previous_flagged; + } else { + bst_uint num_previous_unflagged = (x.idx - segment.begin) - x.flag_scan; + scatter_address = segment.end - num_previous_unflagged - 1; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + + if (x.idx == (segment.end - 1)) { + // Write out counts + counts[x.batch_idx] = {x.flag_scan,0}; + } + + // Discard + return {}; + } +}; + template -void SortPositionBatchUnstable(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); +void SortPositionBatch( + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; + + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + const PerNodeData* batch_info_itr = + reinterpret_cast*>(constant_memory); + int batch_idx; + std::size_t item_idx; + AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); + auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data); + return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), total_rows, stream); + constexpr int kBlockSize = 256; // Value found by experimentation const int kItemsThread = 12; const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); - SortPositionBatchUnstableKernel - <<>>(batch_info, ridx, ridx_tmp,d_counts, op, total_rows); - - SortPositionCopyKernel - <<>>(batch_info, ridx, ridx_tmp, total_rows); + SortPositionCopyKernel + <<>>(ridx, ridx_tmp,total_rows); } struct NodePositionInfo { @@ -260,7 +279,6 @@ class RowPartitioner { CHECK_EQ(nidx.size(), op_data.size()); auto h_batch_info = pinned2_.GetSpan>(nidx.size()); - dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { @@ -268,17 +286,18 @@ class RowPartitioner { op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } - dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), - 
h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault, stream_)); + static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= + sizeof(constant_memory)); + dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), 0, + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatchUnstable(common::Span>( - d_batch_info.data().get(), d_batch_info.size()), + SortPositionBatch( dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), total_rows, op, stream_); dh::safe_cuda( @@ -346,9 +365,8 @@ class RowPartitioner { return position; }; - constexpr int kBlockSize = 256; + constexpr int kBlockSize = 512; - // Value found by experimentation const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index ccde627bd9ea..0061fdb121d6 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -67,13 +67,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), - cudaMemcpyDefault, nullptr)); - SortPositionBatchUnstable( - common::Span>(d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, - op, nullptr); + dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), 0, + cudaMemcpyDefault, nullptr)); + SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 968bb29d3ef39115f674df812e94574cebec964d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Jun 2022 03:35:28 -0700 Subject: [PATCH 53/64] Format --- src/tree/gpu_hist/row_partitioner.cuh | 79 +++++++++++++-------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ee9884a3bf36..876cf2122920 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -2,19 +2,21 @@ * Copyright 2017-2022 XGBoost contributors */ #pragma once +#include + #include #include -#include "xgboost/base.h" + #include "../../common/device_helpers.cuh" +#include "xgboost/base.h" #include "xgboost/generic_parameters.h" #include "xgboost/task.h" #include "xgboost/tree_model.h" -#include namespace xgboost { namespace tree { - /** \brief Used to demarcate a contiguous set of row indices associated with +/** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. */ struct Segment { uint32_t begin{0}; @@ -26,7 +28,7 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -using PartitionCountsT = thrust::pair; +using PartitionCountsT = thrust::pair; // TODO(Rory): Can be larger. To be tuned alongside other batch operations. 
static const int kMaxUpdatePositionBatchSize = 32; @@ -39,8 +41,8 @@ struct PerNodeData { __constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; template -__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, - std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { +__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, + int* batch_idx, std::size_t* item_idx) { uint32_t sum = 0; for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { @@ -52,14 +54,12 @@ __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, } } - template struct SharedStorage { PerNodeData data[kMaxUpdatePositionBatchSize]; // Collectively load from global memory into shared memory template - __device__ const PerNodeData* BlockLoad( - const PerNodeData* d_batch_info) { + __device__ const PerNodeData* BlockLoad(const PerNodeData* d_batch_info) { for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { data[i] = d_batch_info[i]; } @@ -92,10 +92,10 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( // We can scan over this tuple, where the scan gives us information on how to partition inputs // according to the flag struct IndexFlagTuple { - bst_uint idx; // The location of the item we are working on in ridx_ - bst_uint flag_scan; // This gets populated after scanning + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning int batch_idx; // Which node in the batch does this item belong to - bool flag; // Result of op (is this item going left?) + bool flag; // Result of op (is this item going left?) }; struct IndexFlagOp { @@ -114,7 +114,7 @@ template struct WriteResultsFunctor { const bst_uint* ridx_in; bst_uint* ridx_out; - PartitionCountsT *counts; + PartitionCountsT* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; @@ -122,7 +122,7 @@ struct WriteResultsFunctor { reinterpret_cast*>(constant_memory); const Segment& segment = batch_info[x.batch_idx].segment; if (x.flag) { - bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan scatter_address = segment.begin + num_previous_flagged; } else { bst_uint num_previous_unflagged = (x.idx - segment.begin) - x.flag_scan; @@ -132,7 +132,7 @@ struct WriteResultsFunctor { if (x.idx == (segment.end - 1)) { // Write out counts - counts[x.batch_idx] = {x.flag_scan,0}; + counts[x.batch_idx] = {x.flag_scan, 0}; } // Discard @@ -141,10 +141,9 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch( - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { +void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, OpT op, + cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = @@ -173,8 +172,8 @@ void SortPositionBatch( const int kItemsThread = 12; const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); - SortPositionCopyKernel - <<>>(ridx, ridx_tmp,total_rows); + SortPositionCopyKernel + <<>>(ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -184,7 +183,8 @@ struct NodePositionInfo { __device__ bool IsLeaf() 
{ return left_child == -1; } }; -__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const NodePositionInfo* d_node_info) { +__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, + const NodePositionInfo* d_node_info) { int position = 0; NodePositionInfo node = d_node_info[position]; while (!node.IsLeaf()) { @@ -206,7 +206,7 @@ __device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const No template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, - const common::Span d_ridx,common::Span d_out_position, OpT op) { + const common::Span d_ridx, common::Span d_out_position, OpT op) { for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); idx += blockDim.x * gridDim.x) { auto position = GetPositionFromSegments(idx, d_node_info.data()); @@ -222,7 +222,6 @@ class RowPartitioner { public: using RowIndexT = bst_uint; - private: int device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to @@ -282,8 +281,7 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, - op_data.at(i)}; + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= @@ -297,13 +295,11 @@ class RowPartitioner { dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), - total_rows, op, stream_); - dh::safe_cuda( - cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * d_counts.size(), - cudaMemcpyDefault, stream_)); + SortPositionBatch( + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, stream_); + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), + sizeof(decltype(d_counts)::value_type) * d_counts.size(), + cudaMemcpyDefault, stream_)); dh::safe_cuda(cudaStreamSynchronize(stream_)); @@ -323,7 +319,7 @@ class RowPartitioner { } } - /** + /** * \brief Finalise the position of all training instances after tree construction is * complete. Does not update any other meta information in this data structure, so * should only be used at the end of training. @@ -337,8 +333,7 @@ class RowPartitioner { * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
*/ template - void FinalisePosition( - common::Span d_out_position, FinalisePositionOpT op) { + void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), @@ -365,13 +360,13 @@ class RowPartitioner { return position; }; - constexpr int kBlockSize = 512; + constexpr int kBlockSize = 512; - const int kItemsThread = 8; - const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); - common::Span d_ridx(ridx_.data().get(), ridx_.size()); - FinalisePositionKernel<<>>( - dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); + const int kItemsThread = 8; + const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); + common::Span d_ridx(ridx_.data().get(), ridx_.size()); + FinalisePositionKernel<<>>( + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; From 1372ad856ba80b5b558ae3850406f64b211bd453 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 15 Jun 2022 08:33:31 -0700 Subject: [PATCH 54/64] Review comments --- cmake/Utils.cmake | 1 - src/tree/gpu_hist/row_partitioner.cu | 15 ++-- src/tree/gpu_hist/row_partitioner.cuh | 72 ++++++++----------- src/tree/updater_gpu_hist.cu | 15 ++-- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 10 +-- 5 files changed, 47 insertions(+), 66 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 10f0c8104a07..cbc11feb49b6 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -136,7 +136,6 @@ function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> - $<$:-lineinfo> $<$:${GEN_CODE}> $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$:-Xfatbin=-compress-all>) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 53b6039dabd3..015d817f3640 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -1,10 +1,12 @@ /*! - * Copyright 2017-2021 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #include #include #include + #include + #include "../../common/device_helpers.cuh" #include "row_partitioner.cuh" @@ -24,14 +26,8 @@ RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaStreamDestroy(stream_)); } -common::Span RowPartitioner::GetRows( - bst_node_t nidx) { +common::Span RowPartitioner::GetRows(bst_node_t nidx) { auto segment = ridx_segments_.at(nidx).segment; - // Return empty span here as a valid result - // Will error if we try to construct a span from a pointer with size 0 - if (segment.Size() == 0) { - return {}; - } return dh::ToSpan(ridx_).subspan(segment.begin, segment.Size()); } @@ -39,8 +35,7 @@ common::Span RowPartitioner::GetRows() { return dh::ToSpan(ridx_); } -std::vector RowPartitioner::GetRowsHost( - bst_node_t nidx) { +std::vector RowPartitioner::GetRowsHost(bst_node_t nidx) { auto span = GetRows(nidx); std::vector rows(span.size()); dh::CopyDeviceSpanToVector(&rows, span); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 876cf2122920..d27b4aa65551 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -19,17 +19,15 @@ namespace tree { /** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. 
*/ struct Segment { - uint32_t begin{0}; - uint32_t end{0}; + bst_uint begin{0}; + bst_uint end{0}; Segment() = default; - Segment(size_t begin, size_t end) : begin(begin), end(end) { CHECK_GE(end, begin); } + Segment(bst_uint begin, bst_uint end) : begin(begin), end(end) { CHECK_GE(end, begin); } __host__ __device__ size_t Size() const { return end - begin; } }; -using PartitionCountsT = thrust::pair; - // TODO(Rory): Can be larger. To be tuned alongside other batch operations. static const int kMaxUpdatePositionBatchSize = 32; template @@ -43,7 +41,7 @@ __constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; template __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { - uint32_t sum = 0; + bst_uint sum = 0; for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { *batch_idx = i; @@ -80,8 +78,7 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( __shared__ cub::Uninitialized> shared; auto s_batch_info = shared.Alias().BlockLoad(batch_info); - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; - idx += blockDim.x * gridDim.x) { + for (auto idx : dh::GridStrideRange(0, total_rows)) { int batch_idx; std::size_t item_idx; AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); @@ -114,7 +111,7 @@ template struct WriteResultsFunctor { const bst_uint* ridx_in; bst_uint* ridx_out; - PartitionCountsT* counts; + bst_uint* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; @@ -132,7 +129,7 @@ struct WriteResultsFunctor { if (x.idx == (segment.end - 1)) { // Write out counts - counts[x.batch_idx] = {x.flag_scan, 0}; + counts[x.batch_idx] = x.flag_scan; } // Discard @@ -142,7 +139,7 @@ struct WriteResultsFunctor { template void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, OpT op, + common::Span d_counts, std::size_t total_rows, OpT op, cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -178,8 +175,8 @@ void SortPositionBatch(common::Span ridx, common::Span rid struct NodePositionInfo { Segment segment; - int left_child = -1; - int right_child = -1; + bst_node_t left_child = -1; + bst_node_t right_child = -1; __device__ bool IsLeaf() { return left_child == -1; } }; @@ -207,8 +204,7 @@ template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, const common::Span d_ridx, common::Span d_out_position, OpT op) { - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); - idx += blockDim.x * gridDim.x) { + for (auto idx : dh::GridStrideRange(0, d_ridx.size())) { auto position = GetPositionFromSegments(idx, d_node_info.data()); RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -221,6 +217,7 @@ __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( class RowPartitioner { public: using RowIndexT = bst_uint; + static constexpr bst_node_t kIgnoredTreePosition = -1; private: int device_idx_; @@ -267,6 +264,20 @@ class RowPartitioner { */ std::vector GetRowsHost(bst_node_t nidx); + /** + * \brief Updates the tree position for set of training instances being split + * into left and right child nodes. Accepts a user-defined lambda specifying + * which branch each training instance should go down. 
+ * + * \tparam UpdatePositionOpT + * \tparam OpDataT + * \param nidx The index of the nodes being split. + * \param left_nidx The left child indices. + * \param right_nidx The right child indices. + * \param op_data User-defined data provided as the second argument to op + * \param op Device lambda with the row index as the first argument and op_data as the + * second. Returns true if this training instance goes on the left partition. + */ template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, @@ -291,8 +302,8 @@ class RowPartitioner { cudaMemcpyDefault, stream_)); // Temporary arrays - auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); - dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); + auto h_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( @@ -300,13 +311,14 @@ class RowPartitioner { dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), cudaMemcpyDefault, stream_)); - + // TODO(Rory): this synchronisation hurts performance a lot + // Future optimisation should find a way to skip this dh::safe_cuda(cudaStreamSynchronize(stream_)); // Update segments for (int i = 0; i < nidx.size(); i++) { auto segment = ridx_segments_.at(nidx[i]).segment; - auto left_count = h_counts[i].first; + auto left_count = h_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), @@ -339,29 +351,7 @@ class RowPartitioner { sizeof(NodePositionInfo) * ridx_segments_.size(), cudaMemcpyDefault, stream_)); - auto d_node_info = d_node_info_storage.data().get(); - - auto current_position = [=] __device__(std::size_t idx) { - int position = 0; - NodePositionInfo node = d_node_info[position]; - while (!node.IsLeaf()) { - NodePositionInfo left = d_node_info[node.left_child]; - NodePositionInfo right = d_node_info[node.right_child]; - if (idx >= left.segment.begin && idx < left.segment.end) { - position = node.left_child; - node = left; - } else if (idx >= right.segment.begin && idx < right.segment.end) { - position = node.right_child; - node = right; - } else { - KERNEL_CHECK(false); - } - } - return position; - }; - constexpr int kBlockSize = 512; - const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e461ae4f362e..e41c1d31514a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -378,7 +378,7 @@ struct GPUHistMakerDevice { nidx.at(i) = e.nid; left_nidx.at(i) = split_node.LeftChild(); right_nidx.at(i) = split_node.RightChild(); - split_data.at(i) = NodeSplitData{ split_node, split_type, e.split.split_cats }; + split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats}; } auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); @@ -401,7 +401,6 @@ struct GPUHistMakerDevice { } return go_left; }); - } // After tree update is finished, update the position of all training @@ -459,7 +458,7 @@ struct GPUHistMakerDevice { auto new_position_op = [=] __device__(size_t row_id, int position) { // What happens if user prune the tree? 
if (!d_matrix.IsInRange(row_id)) { - return -1; + return RowPartitioner::kIgnoredTreePosition; } auto node = d_nodes[position]; @@ -483,7 +482,7 @@ struct GPUHistMakerDevice { position = node.RightChild(); } } - + node = d_nodes[position]; } @@ -502,17 +501,15 @@ struct GPUHistMakerDevice { }); } - bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); auto d_update_predictions = dh::ToSpan(update_predictions); - if (d_update_predictions.empty()) return false; CHECK_EQ(out_preds_d.Size(), d_update_predictions.size()); dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { out_preds_d(idx) += d_update_predictions[idx]; }); - return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -844,9 +841,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - auto result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return result; + return true; } TrainParam param_; // NOLINT diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 0061fdb121d6..8ad85779cc77 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -55,7 +55,7 @@ TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); - thrust::device_vector counts(segments.size()); + thrust::device_vector counts(segments.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); @@ -77,12 +77,12 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector Date: Fri, 17 Jun 2022 02:59:54 -0700 Subject: [PATCH 55/64] Reintroduce prediction caching for external memory. 
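
With external memory the per-row leaf-value buffer has to be filled from the ELLPACK pages rather than a single cached page, and the row partitioner is reallocated whenever its size no longer matches the number of training rows. The core caching idea, sketched standalone below, is that FinalisePosition records each row's leaf value and UpdatePredictionCache later adds that buffer onto the cached predictions instead of re-running prediction. This is an illustrative host-side sketch only; the data is made up and it is not part of the applied diff.

    #include <cstdio>
    #include <vector>

    // Illustrative host-side sketch of the prediction-cache idea (made-up data,
    // not the actual device code): stash each row's leaf value while finalising
    // positions, then bump the cached predictions from that buffer.
    int main() {
      std::vector<float> leaf_value = {0.5f, -0.25f, 0.1f};  // leaves of the new tree
      std::vector<int> final_position = {2, 0, 1, 2};        // leaf reached by each row
      std::vector<float> out_preds = {1.f, 1.f, 1.f, 1.f};   // cached predictions so far

      // Role of FinalisePosition: record the leaf value per training row.
      std::vector<float> update_predictions(final_position.size());
      for (std::size_t i = 0; i < final_position.size(); ++i) {
        update_predictions[i] = leaf_value[final_position[i]];
      }

      // Role of UpdatePredictionCache: add the buffer onto the cache instead of
      // re-running prediction over the whole DMatrix.
      for (std::size_t i = 0; i < out_preds.size(); ++i) {
        out_preds[i] += update_predictions[i];
      }
      for (float p : out_preds) {
        std::printf("%g ", p);  // prints: 1.1 1.5 0.75 1.1
      }
      std::printf("\n");
      return 0;
    }

The change below does the same per-row bookkeeping on the device, iterating GetBatches<EllpackPage> when a single page does not cover all rows.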
--- src/tree/updater_gpu_hist.cu | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e41c1d31514a..a81129c64120 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -413,11 +413,6 @@ struct GPUHistMakerDevice { LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; } - // External memory will not use prediction cache - if (!p_fmat->SingleColBlock()) { - return; - } - dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), d_nodes.size() * sizeof(RegTree::Node), @@ -436,9 +431,25 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); + if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { + row_partitioner.reset(); // Release the device memory first before reallocating + row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); + } + if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { + // see comment in the `FinalisePositionInPage`. + LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + } + if (page->n_rows == p_fmat->Info().num_row_) { + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); + } else { + for (auto const& batch : p_fmat->GetBatches(batch_param)) { + FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); + } + } } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -446,7 +457,6 @@ struct GPUHistMakerDevice { common::Span d_feature_types, common::Span categories, common::Span categories_segments, - ObjInfo task, HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; From ff05df532b82c0ddf08e338efa43d7aeb89feea5 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 19 Jun 2022 06:09:05 -0700 Subject: [PATCH 56/64] Avoid initialising temp memory --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 21 ++++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 ++- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 015d817f3640..87d3ad62e003 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows),d_counts(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d27b4aa65551..a9cca72f4ce1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ 
b/src/tree/gpu_hist/row_partitioner.cuh @@ -140,7 +140,7 @@ struct WriteResultsFunctor { template void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, - cudaStream_t stream) { + dh::device_vector* tmp, cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = @@ -157,10 +157,13 @@ void SortPositionBatch(common::Span ridx, common::Span rid return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res}; }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + if (tmp->empty()) { + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); + tmp->resize(temp_bytes); + } + temp_bytes = tmp->size(); + cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), total_rows, stream); constexpr int kBlockSize = 256; @@ -239,6 +242,8 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; + dh::TemporaryArray d_counts; + dh::device_vector tmp; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; cudaStream_t stream_; @@ -303,13 +308,13 @@ class RowPartitioner { // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); - dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, stream_); + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp, + stream_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * d_counts.size(), + sizeof(decltype(d_counts)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 8ad85779cc77..d35178c643c3 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -70,8 +70,9 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), 0, cudaMemcpyDefault, nullptr)); + dh::device_vector tmp; SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, nullptr); + dh::ToSpan(counts), total_rows, op, &tmp,nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 0280b8c21284344af83dd1207f911c6c7f1f0a98 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 23 Jun 2022 05:48:24 -0700 Subject: [PATCH 57/64] Lint --- src/tree/gpu_hist/row_partitioner.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 87d3ad62e003..46cb67003962 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,10 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), 
ridx_(num_rows), ridx_tmp_(num_rows),d_counts(kMaxUpdatePositionBatchSize) { + : device_idx_(device_idx), + ridx_(num_rows), + ridx_tmp_(num_rows), + d_counts(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); From 9c642dcaf8826ba3d8bc0f548fa3f6312d043960 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 24 Jun 2022 03:17:31 -0700 Subject: [PATCH 58/64] Review comments. --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 12 ++++++------ src/tree/updater_gpu_hist.cu | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 46cb67003962..540c07a6fe64 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -17,7 +17,7 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), - d_counts(kMaxUpdatePositionBatchSize) { + d_counts_(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a9cca72f4ce1..3a42f9245a63 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -242,8 +242,8 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray d_counts; - dh::device_vector tmp; + dh::TemporaryArray d_counts_; + dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; cudaStream_t stream_; @@ -301,7 +301,7 @@ class RowPartitioner { total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= - sizeof(constant_memory)); + sizeof(constant_memory),"Not enough constant memory allocated.") ; dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), 0, cudaMemcpyDefault, stream_)); @@ -311,10 +311,10 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp, + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), total_rows, op, &tmp_, stream_); - dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * h_counts.size(), + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), + sizeof(decltype(d_counts_)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a81129c64120..aa65c16bae1d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -507,6 +507,8 @@ struct GPUHistMakerDevice { bst_node_t position = d_out_position[idx]; d_update_predictions[idx] = d_nodes[position].LeafValue(); bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the 
gradient vector. d_out_position[idx] = is_row_sampled ? ~position : position; }); } From b4f2128fc59fa320cc62646e90dd5b58103b5a07 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 27 Jun 2022 05:20:23 -0700 Subject: [PATCH 59/64] Remove external memory prediction caching. --- src/tree/updater_gpu_hist.cu | 42 +++++++++++++----------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index aa65c16bae1d..7074359131df 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -408,9 +408,11 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, HostDeviceVector* p_out_position) { - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + // Prediction cache will not be used with external memory + if (!p_fmat->SingleColBlock()) { + p_out_position->Resize(0); + update_predictions.clear(); + return; } dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); @@ -431,25 +433,9 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { - row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); - } - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; - } - if (page->n_rows == p_fmat->Info().num_row_) { - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), - p_out_position); - } else { - for (auto const& batch : p_fmat->GetBatches(batch_param)) { - FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), - p_out_position); - } - } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -507,13 +493,14 @@ struct GPUHistMakerDevice { bst_node_t position = d_out_position[idx]; d_update_predictions[idx] = d_nodes[position].LeafValue(); bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. d_out_position[idx] = is_row_sampled ? 
~position : position; }); } - void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + if (update_predictions.empty()) { + return false; + } CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); @@ -522,6 +509,7 @@ struct GPUHistMakerDevice { dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { out_preds_d(idx) += d_update_predictions[idx]; }); + return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -853,9 +841,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return true; + return result; } TrainParam param_; // NOLINT From 776ef9fb807dfc568fef0bdec561d7753886464b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:25:56 -0700 Subject: [PATCH 60/64] Remove constant memory in favour of __ldg(). --- src/common/device_helpers.cuh | 20 +++++++ src/tree/gpu_hist/row_partitioner.cuh | 57 ++++++------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 11 ++-- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 123dc14e57be..33989a230464 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1939,4 +1939,24 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; + +// Force nvcc to load data as constant +template +class LDGIterator { + typedef typename cub::UnitWord::DeviceWord DeviceWordT; + static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); + + const T* ptr; + + public: + LDGIterator(const T* ptr) : ptr(ptr) {} + __device__ T operator[](std::size_t idx) const { + DeviceWordT tmp[kNumWords]; +#pragma unroll + for (int i = 0; i < kNumWords; i++) { + tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + } + return *reinterpret_cast(tmp); + } +}; } // namespace dh diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 3a42f9245a63..e9fb7e86add7 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -36,8 +36,6 @@ struct PerNodeData { OpDataT data; }; -__constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; - template __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { @@ -52,36 +50,14 @@ __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t g } } -template -struct SharedStorage { - PerNodeData data[kMaxUpdatePositionBatchSize]; - // Collectively load from global memory into shared memory - template - __device__ const PerNodeData* BlockLoad(const PerNodeData* d_batch_info) { - for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { - data[i] = d_batch_info[i]; - } - __syncthreads(); - return data; - } -}; - template __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( - common::Span d_ridx, const common::Span ridx_tmp, - std::size_t total_rows) { - // Load this into shared memory - // the compiler puts it into registers otherwise - // then we get spilling to local memory - const 
PerNodeData* batch_info = - reinterpret_cast*>(constant_memory); - __shared__ cub::Uninitialized> shared; - auto s_batch_info = shared.Alias().BlockLoad(batch_info); - + dh::LDGIterator> batch_info, common::Span d_ridx, + const common::Span ridx_tmp, std::size_t total_rows) { for (auto idx : dh::GridStrideRange(0, total_rows)) { int batch_idx; std::size_t item_idx; - AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); + AssignBatch(batch_info, idx, &batch_idx, &item_idx); d_ridx[item_idx] = ridx_tmp[item_idx]; } } @@ -109,14 +85,13 @@ struct IndexFlagOp { template struct WriteResultsFunctor { + dh::LDGIterator> batch_info; const bst_uint* ridx_in; bst_uint* ridx_out; bst_uint* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; - const PerNodeData* batch_info = - reinterpret_cast*>(constant_memory); const Segment& segment = batch_info[x.batch_idx].segment; if (x.flag) { bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan @@ -138,18 +113,19 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, +void SortPositionBatch(common::Span> d_batch_info, + common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, dh::device_vector* tmp, cudaStream_t stream) { - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; + dh::LDGIterator> batch_info_itr(d_batch_info.data()); + WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), + d_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); auto input_iterator = dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - const PerNodeData* batch_info_itr = - reinterpret_cast*>(constant_memory); int batch_idx; std::size_t item_idx; AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); @@ -173,7 +149,7 @@ void SortPositionBatch(common::Span ridx, common::Span rid const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionCopyKernel - <<>>(ridx, ridx_tmp, total_rows); + <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -294,25 +270,24 @@ class RowPartitioner { CHECK_EQ(nidx.size(), op_data.size()); auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } - static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= - sizeof(constant_memory),"Not enough constant memory allocated.") ; - dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), - h_batch_info.size() * sizeof(PerNodeData), 0, - cudaMemcpyDefault, stream_)); + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), total_rows, op, &tmp_, - stream_); + dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), + total_rows, op, &tmp_, stream_); 
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), sizeof(decltype(d_counts_)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index d35178c643c3..520cc3cd0b81 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -67,12 +67,13 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), 0, - cudaMemcpyDefault, nullptr)); + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, + nullptr)); dh::device_vector tmp; - SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, &tmp,nullptr); + SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), + dh::ToSpan(ridx_tmp), dh::ToSpan(counts), + total_rows, op, &tmp, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 33fea3dcbfc560c1643d641098d325e1788a7172 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:43:46 -0700 Subject: [PATCH 61/64] Clang tidy --- src/common/device_helpers.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 33989a230464..a0d94886da60 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1943,20 +1943,20 @@ class CUDAStream { // Force nvcc to load data as constant template class LDGIterator { - typedef typename cub::UnitWord::DeviceWord DeviceWordT; + using DeviceWordT = typename cub::UnitWord::DeviceWord; static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); - const T* ptr; + const T *ptr_; public: - LDGIterator(const T* ptr) : ptr(ptr) {} + LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; #pragma unroll for (int i = 0; i < kNumWords; i++) { - tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + tmp[i] = __ldg(reinterpret_cast(ptr_ + idx) + i); } - return *reinterpret_cast(tmp); + return *reinterpret_cast(tmp); } }; } // namespace dh From 9de06928b38712e979de8252f38feed77388cb47 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:58:59 -0700 Subject: [PATCH 62/64] Clang tidy --- src/common/device_helpers.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index a0d94886da60..738cf8de2106 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1949,7 +1949,7 @@ class LDGIterator { const T *ptr_; public: - LDGIterator(const T *ptr) : ptr_(ptr) {} + explicit LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; #pragma unroll From 3cd5e41b8e7776f06cc803cbbdd85c7c8abf3b4c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 29 Jun 2022 06:08:32 -0700 Subject: [PATCH 63/64] Review comments. 
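
Review feedback adds a static_assert inside LDGIterator guarding its word-wise reassembly, and a hard error when leaf updates are requested together with external memory. The __ldg pattern that the assertion protects is sketched standalone below; the Segment struct and kernel here are illustrative stand-ins, not the tree updater's types, and the sketch is not part of the applied diff.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Stand-in aggregate; two 32-bit fields, so it splits into exactly two words.
    struct Segment {
      unsigned begin;
      unsigned end;
    };

    // Load a Segment through the read-only data cache one word at a time, then
    // reassemble it; this is the same pattern dh::LDGIterator uses.
    __device__ Segment LoadLdg(const Segment* ptr, int idx) {
      constexpr int kNumWords = sizeof(Segment) / sizeof(unsigned);
      unsigned tmp[kNumWords];
      static_assert(sizeof(tmp) == sizeof(Segment), "Expect sizes to be equal.");
    #pragma unroll
      for (int i = 0; i < kNumWords; ++i) {
        tmp[i] = __ldg(reinterpret_cast<const unsigned*>(ptr + idx) + i);
      }
      return *reinterpret_cast<Segment*>(tmp);
    }

    __global__ void SizeKernel(const Segment* segments, unsigned* sizes, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        Segment s = LoadLdg(segments, i);
        sizes[i] = s.end - s.begin;
      }
    }

    int main() {
      Segment h_seg[2] = {{0u, 4u}, {4u, 10u}};
      unsigned h_sizes[2] = {0u, 0u};
      Segment* d_seg;
      unsigned* d_sizes;
      cudaMalloc(&d_seg, sizeof(h_seg));
      cudaMalloc(&d_sizes, sizeof(h_sizes));
      cudaMemcpy(d_seg, h_seg, sizeof(h_seg), cudaMemcpyHostToDevice);
      SizeKernel<<<1, 32>>>(d_seg, d_sizes, 2);
      cudaMemcpy(h_sizes, d_sizes, sizeof(h_sizes), cudaMemcpyDeviceToHost);
      std::printf("sizes: %u %u\n", h_sizes[0], h_sizes[1]);  // expect: 4 6
      cudaFree(d_seg);
      cudaFree(d_sizes);
      return 0;
    }

Reading PerNodeData through such an iterator keeps the batch metadata in the read-only cache without reserving __constant__ memory, which the earlier patch in this series removed.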
--- src/common/device_helpers.cuh | 1 + src/tree/updater_gpu_hist.cu | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 738cf8de2106..ccec859a286c 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1952,6 +1952,7 @@ class LDGIterator { explicit LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; + static_assert(sizeof(tmp) == sizeof(T), "Expect sizes to be equal."); #pragma unroll for (int i = 0; i < kNumWords; i++) { tmp[i] = __ldg(reinterpret_cast(ptr_ + idx) + i); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 7074359131df..5eaaeecbadf6 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -410,6 +410,9 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { // Prediction cache will not be used with external memory if (!p_fmat->SingleColBlock()) { + if (task.UpdateTreeLeaf()) { + LOG(FATAL) << "Current objective function can not be used with external memory."; + } p_out_position->Resize(0); update_predictions.clear(); return; From 9eddfce0077de0db552d70833fe971896f842e02 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 30 Jun 2022 04:54:12 -0700 Subject: [PATCH 64/64] Initialise memory in case zero training rows. --- src/tree/gpu_hist/row_partitioner.cu | 5 +---- src/tree/gpu_hist/row_partitioner.cuh | 8 +++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 540c07a6fe64..015d817f3640 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,10 +14,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), - ridx_(num_rows), - ridx_tmp_(num_rows), - d_counts_(kMaxUpdatePositionBatchSize) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e9fb7e86add7..4ba0bd27fe2f 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -218,7 +218,6 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray d_counts_; dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; @@ -283,13 +282,13 @@ class RowPartitioner { // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), + dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp_, stream_); - dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), - sizeof(decltype(d_counts_)::value_type) * h_counts.size(), + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this @@ -300,7 +299,6 @@ class 
RowPartitioner { auto segment = ridx_segments_.at(nidx[i]).segment; auto left_count = h_counts[i]; CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), std::max(left_nidx[i], right_nidx[i]) + 1)); ridx_segments_[nidx[i]] = NodePositionInfo{segment, left_nidx[i], right_nidx[i]};