From 2b4cf67cd12672c0c8bca04549f0c65400ec6d0d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 21 Apr 2022 03:10:55 -0700 Subject: [PATCH 01/64] Remove single_precision_histogram --- doc/gpu/index.rst | 4 +- doc/parameter.rst | 2 +- include/xgboost/tree_updater.h | 6 +- src/tree/tree_updater.cc | 3 +- src/tree/updater_approx.cc | 9 +- src/tree/updater_basemaker-inl.h | 7 +- src/tree/updater_colmaker.cc | 7 +- src/tree/updater_gpu_hist.cu | 162 +++++++--------------- src/tree/updater_histmaker.cc | 20 ++- src/tree/updater_prune.cc | 10 +- src/tree/updater_quantile_hist.cc | 4 +- src/tree/updater_quantile_hist.h | 3 +- src/tree/updater_refresh.cc | 13 +- src/tree/updater_sync.cc | 9 +- tests/cpp/tree/test_gpu_hist.cu | 16 +-- tests/python-gpu/test_gpu_basic_models.py | 4 +- tests/python-gpu/test_gpu_updaters.py | 3 +- 17 files changed, 107 insertions(+), 175 deletions(-) diff --git a/doc/gpu/index.rst b/doc/gpu/index.rst index e36fc72a1746..049cf311dff2 100644 --- a/doc/gpu/index.rst +++ b/doc/gpu/index.rst @@ -59,13 +59,11 @@ Supported parameters +--------------------------------+--------------+ | ``interaction_constraints`` | |tick| | +--------------------------------+--------------+ -| ``single_precision_histogram`` | |tick| | +| ``single_precision_histogram`` | |cross| | +--------------------------------+--------------+ GPU accelerated prediction is enabled by default for the above mentioned ``tree_method`` parameters but can be switched to CPU prediction by setting ``predictor`` to ``cpu_predictor``. This could be useful if you want to conserve GPU memory. Likewise when using CPU algorithms, GPU accelerated prediction can be enabled by setting ``predictor`` to ``gpu_predictor``. -The experimental parameter ``single_precision_histogram`` can be set to True to enable building histograms using single precision. This may improve speed, in particular on older architectures. - The device ordinal (which GPU to use if you have many of them) can be selected using the ``gpu_id`` parameter, which defaults to 0 (the first device reported by CUDA runtime). diff --git a/doc/parameter.rst b/doc/parameter.rst index 781150490082..4392b5bf7680 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -240,7 +240,7 @@ Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method * ``single_precision_histogram``, [default= ``false``] - - Use single precision to build histograms instead of double precision. + - Use single precision to build histograms instead of double precision. Currently disabled for ``gpu_hist``. * ``max_cat_to_onehot`` diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 6189221dc0bf..6248a65e270d 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -35,6 +35,7 @@ class TreeUpdater : public Configurable { GenericParameter const* ctx_ = nullptr; public: + explicit TreeUpdater(const GenericParameter* ctx) : ctx_(ctx) {} /*! \brief virtual destructor */ ~TreeUpdater() override = default; /*! @@ -91,8 +92,9 @@ class TreeUpdater : public Configurable { * \brief Registry entry for tree updater. */ struct TreeUpdaterReg - : public dmlc::FunctionRegEntryBase > {}; + : public dmlc::FunctionRegEntryBase< + TreeUpdaterReg, + std::function > {}; /*! * \brief Macro to register tree updater. 
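The documentation changes above drop single_precision_histogram support from gpu_hist; the rest of this patch makes the GPU updater always accumulate histograms in double precision (GradientPairPrecise). As a small standalone illustration of the numerical issue behind that choice, summing many small gradient-like values into a float bin drifts far more than summing them into a double bin. The row count and gradient value below are made up for the demonstration:

    #include <cstdio>

    int main() {
      const int n = 100000000;   // pretend 1e8 rows contribute to one histogram bin
      const float g = 1e-4f;     // a small per-row gradient contribution

      float sum_f = 0.0f;        // single-precision bin
      double sum_d = 0.0;        // double-precision bin
      for (int i = 0; i < n; ++i) {
        sum_f += g;
        sum_d += static_cast<double>(g);
      }

      // The mathematically expected total is about 10000. The float accumulator
      // stalls (roughly around 2048) once each addition is smaller than half a
      // unit in the last place of the running sum, while the double accumulator
      // stays close to the true value.
      std::printf("float  bin: %.4f\n", sum_f);
      std::printf("double bin: %.4f\n", sum_d);
      return 0;
    }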
diff --git a/src/tree/tree_updater.cc b/src/tree/tree_updater.cc index 05f6c4bb5fd6..ee5659636305 100644 --- a/src/tree/tree_updater.cc +++ b/src/tree/tree_updater.cc @@ -20,8 +20,7 @@ TreeUpdater* TreeUpdater::Create(const std::string& name, GenericParameter const if (e == nullptr) { LOG(FATAL) << "Unknown tree updater " << name; } - auto p_updater = (e->body)(task); - p_updater->ctx_ = tparam; + auto p_updater = (e->body)(tparam, task); return p_updater; } diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index 3bad6f7da4cc..a06f195374b6 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -252,7 +252,10 @@ class GlobalApproxUpdater : public TreeUpdater { ObjInfo task_; public: - explicit GlobalApproxUpdater(ObjInfo task) : task_{task} { monitor_.Init(__func__); } + explicit GlobalApproxUpdater(GenericParameter const *ctx, ObjInfo task) + : task_{task}, TreeUpdater(ctx) { + monitor_.Init(__func__); + } void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); @@ -343,6 +346,8 @@ XGBOOST_REGISTER_TREE_UPDATER(GlobalHistMaker, "grow_histmaker") .describe( "Tree constructor that uses approximate histogram construction " "for each node.") - .set_body([](ObjInfo task) { return new GlobalApproxUpdater(task); }); + .set_body([](GenericParameter const *ctx, ObjInfo task) { + return new GlobalApproxUpdater(ctx, task); + }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_basemaker-inl.h b/src/tree/updater_basemaker-inl.h index da239b2090c7..7fc44a6d15fb 100644 --- a/src/tree/updater_basemaker-inl.h +++ b/src/tree/updater_basemaker-inl.h @@ -33,11 +33,10 @@ namespace tree { * \brief base tree maker class that defines common operation * needed in tree making */ -class BaseMaker: public TreeUpdater { +class BaseMaker : public TreeUpdater { public: - void Configure(const Args& args) override { - param_.UpdateAllowUnknown(args); - } + explicit BaseMaker(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); } void LoadConfig(Json const& in) override { auto const& config = get(in); diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index e3d716f2cba8..f4279a0a1c3b 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -57,7 +57,8 @@ DMLC_REGISTER_PARAMETER(ColMakerTrainParam); /*! 
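The hunks above change the updater factory so the context is supplied at construction time ((e->body)(tparam, task)) instead of being patched onto ctx_ after the object exists. A minimal, self-contained sketch of that registration pattern; Context, ObjInfo, Updater and Registry here are stand-ins, not the real XGBoost types:

    #include <functional>
    #include <iostream>
    #include <map>
    #include <memory>
    #include <string>

    struct Context { int gpu_id{-1}; };   // stand-in for GenericParameter
    struct ObjInfo { int task{0}; };      // stand-in for xgboost::ObjInfo

    class Updater {
     public:
      explicit Updater(Context const* ctx) : ctx_(ctx) {}  // context fixed at construction
      virtual ~Updater() = default;
      virtual char const* Name() const = 0;

     protected:
      Context const* ctx_;
    };

    // Registry entry: the factory receives both the context and the task,
    // mirroring the new TreeUpdaterReg signature.
    using Factory = std::function<Updater*(Context const*, ObjInfo)>;

    std::map<std::string, Factory>& Registry() {
      static std::map<std::string, Factory> r;
      return r;
    }

    class Pruner : public Updater {
     public:
      Pruner(Context const* ctx, ObjInfo) : Updater(ctx) {}
      char const* Name() const override { return "prune"; }
    };

    int main() {
      Registry()["prune"] = [](Context const* ctx, ObjInfo task) { return new Pruner(ctx, task); };
      Context ctx{0};
      std::unique_ptr<Updater> up{Registry().at("prune")(&ctx, ObjInfo{})};
      std::cout << up->Name() << "\n";  // prints "prune"
      return 0;
    }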
\brief column-wise update to construct a tree */ class ColMaker: public TreeUpdater { public: - void Configure(const Args& args) override { + explicit ColMaker(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); colmaker_param_.UpdateAllowUnknown(args); } @@ -614,8 +615,8 @@ class ColMaker: public TreeUpdater { XGBOOST_REGISTER_TREE_UPDATER(ColMaker, "grow_colmaker") .describe("Grow tree with parallelization over columns.") -.set_body([](ObjInfo) { - return new ColMaker(); +.set_body([](GenericParameter const* ctx, ObjInfo) { + return new ColMaker(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index cb7dd9b7e8e4..2cac6b6c4f4a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -45,12 +45,9 @@ DMLC_REGISTRY_FILE_TAG(updater_gpu_hist); // training parameters specific to this algorithm struct GPUHistMakerTrainParam : public XGBoostParameter { - bool single_precision_histogram; bool debug_synchronize; // declare parameters DMLC_DECLARE_PARAMETER(GPUHistMakerTrainParam) { - DMLC_DECLARE_FIELD(single_precision_histogram).set_default(false).describe( - "Use single precision to build histograms."); DMLC_DECLARE_FIELD(debug_synchronize).set_default(false).describe( "Check if all distributed tree are identical after tree construction."); } @@ -532,6 +529,13 @@ struct GPUHistMakerDevice { void ApplySplit(const GPUExpandEntry& candidate, RegTree* p_tree) { RegTree& tree = *p_tree; + + // Sanity check - have we created a leaf with no training instances? + if (!rabit::IsDistributed()) { + CHECK(row_partitioner->GetRows(candidate.nid).size() > 0) + << "No training instances in this leaf!"; + } + auto parent_sum = candidate.split.left_sum + candidate.split.right_sum; auto base_weight = candidate.base_weight; auto left_weight = candidate.left_weight * param.learning_rate; @@ -676,20 +680,35 @@ struct GPUHistMakerDevice { } }; -template -class GPUHistMakerSpecialised { +class GPUHistMaker : public TreeUpdater { + using GradientSumT = GradientPairPrecise; + public: - explicit GPUHistMakerSpecialised(ObjInfo task) : task_{task} {}; - void Configure(const Args& args, GenericParameter const* generic_param) { + explicit GPUHistMaker(GenericParameter const* ctx, ObjInfo task) + : TreeUpdater(ctx), task_{task} {}; + void Configure(const Args& args) { + // Used in test to count how many configurations are performed + LOG(DEBUG) << "[GPU Hist]: Configure"; param_.UpdateAllowUnknown(args); - generic_param_ = generic_param; hist_maker_param_.UpdateAllowUnknown(args); dh::CheckComputeCapability(); monitor_.Init("updater_gpu_hist"); } - ~GPUHistMakerSpecialised() { // NOLINT + void LoadConfig(Json const& in) override { + auto const& config = get(in); + FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_); + initialised_ = false; + FromJson(config.at("train_param"), ¶m_); + } + void SaveConfig(Json* p_out) const override { + auto& out = *p_out; + out["gpu_hist_train_param"] = ToJson(hist_maker_param_); + out["train_param"] = ToJson(param_); + } + + ~GPUHistMaker() { // NOLINT dh::GlobalMemoryLogger().Log(); } @@ -719,30 +738,24 @@ class GPUHistMakerSpecialised { } void InitDataOnce(DMatrix* dmat) { - device_ = generic_param_->gpu_id; - CHECK_GE(device_, 0) << "Must have at least one device"; + CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; info_ = &dmat->Info(); - reducer_.Init({device_}); // NOLINT + 
reducer_.Init({ctx_->gpu_id}); // NOLINT // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); BatchParam batch_param{ - device_, - param_.max_bin, + ctx_->gpu_id, + param_.max_bin, }; auto page = (*dmat->GetBatches(batch_param).begin()).Impl(); - dh::safe_cuda(cudaSetDevice(device_)); - info_->feature_types.SetDevice(device_); - maker.reset(new GPUHistMakerDevice(device_, - page, - info_->feature_types.ConstDeviceSpan(), - info_->num_row_, - param_, - column_sampling_seed, - info_->num_col_, - batch_param)); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + info_->feature_types.SetDevice(ctx_->gpu_id); + maker.reset(new GPUHistMakerDevice( + ctx_->gpu_id, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_, + column_sampling_seed, info_->num_col_, batch_param)); p_last_fmat_ = dmat; initialised_ = true; @@ -766,7 +779,7 @@ class GPUHistMakerSpecialised { } fs.Seek(0); rabit::Broadcast(&s_model, 0); - RegTree reference_tree {}; // rank 0 tree + RegTree reference_tree{}; // rank 0 tree reference_tree.Load(&fs); CHECK(*local_tree == reference_tree); } @@ -775,13 +788,11 @@ class GPUHistMakerSpecialised { monitor_.Start("InitData"); this->InitData(p_fmat); monitor_.Stop("InitData"); - - gpair->SetDevice(device_); + gpair->SetDevice(ctx_->gpu_id); maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_); } - bool UpdatePredictionCache(const DMatrix *data, - linalg::VectorView p_out_preds) { + bool UpdatePredictionCache(const DMatrix* data, linalg::VectorView p_out_preds) { if (maker == nullptr || p_last_fmat_ == nullptr || p_last_fmat_ != data) { return false; } @@ -791,107 +802,32 @@ class GPUHistMakerSpecialised { return true; } - TrainParam param_; // NOLINT - MetaInfo* info_{}; // NOLINT + TrainParam param_; // NOLINT + MetaInfo* info_{}; // NOLINT std::unique_ptr> maker; // NOLINT + char const* Name() const override { return "grow_gpu_hist"; } + private: - bool initialised_ { false }; + bool initialised_{false}; GPUHistMakerTrainParam hist_maker_param_; - GenericParameter const* generic_param_; dh::AllReducer reducer_; - DMatrix* p_last_fmat_ { nullptr }; - int device_{-1}; + DMatrix* p_last_fmat_{nullptr}; ObjInfo task_; common::Monitor monitor_; }; -class GPUHistMaker : public TreeUpdater { - public: - explicit GPUHistMaker(ObjInfo task) : task_{task} {} - void Configure(const Args& args) override { - // Used in test to count how many configurations are performed - LOG(DEBUG) << "[GPU Hist]: Configure"; - hist_maker_param_.UpdateAllowUnknown(args); - // The passed in args can be empty, if we simply purge the old maker without - // preserving parameters then we can't do Update on it. 
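With the specialised float/double makers gone, GPUHistMaker itself owns the LoadConfig/SaveConfig pair added above for both parameter groups, and clears initialised_ so the next Update re-runs data initialisation. A rough host-side sketch of that contract, using a plain string map in place of XGBoost's Json; all struct and field names here are illustrative only:

    #include <cassert>
    #include <map>
    #include <string>

    // Illustrative stand-ins for the two parameter groups handled by SaveConfig/LoadConfig.
    struct HistMakerParam { bool debug_synchronize{false}; };
    struct TrainParamLite { int max_depth{6}; double eta{0.3}; };

    using KV = std::map<std::string, std::string>;

    struct Maker {
      HistMakerParam hist_param;
      TrainParamLite train_param;
      bool initialised{false};

      KV SaveConfig() const {
        return {{"debug_synchronize", hist_param.debug_synchronize ? "1" : "0"},
                {"max_depth", std::to_string(train_param.max_depth)},
                {"eta", std::to_string(train_param.eta)}};
      }
      void LoadConfig(KV const& in) {
        hist_param.debug_synchronize = in.at("debug_synchronize") == "1";
        train_param.max_depth = std::stoi(in.at("max_depth"));
        train_param.eta = std::stod(in.at("eta"));
        initialised = false;  // force data re-initialisation on the next Update, as in the patch
      }
    };

    int main() {
      Maker a;
      a.train_param.max_depth = 8;
      a.initialised = true;

      Maker b;
      b.LoadConfig(a.SaveConfig());  // round-trips both parameter groups
      assert(b.train_param.max_depth == 8);
      assert(!b.initialised);
      return 0;
    }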
- TrainParam param; - if (float_maker_) { - param = float_maker_->param_; - } else if (double_maker_) { - param = double_maker_->param_; - } - if (hist_maker_param_.single_precision_histogram) { - float_maker_.reset(new GPUHistMakerSpecialised(task_)); - float_maker_->param_ = param; - float_maker_->Configure(args, ctx_); - } else { - double_maker_.reset(new GPUHistMakerSpecialised(task_)); - double_maker_->param_ = param; - double_maker_->Configure(args, ctx_); - } - } - - void LoadConfig(Json const& in) override { - auto const& config = get(in); - FromJson(config.at("gpu_hist_train_param"), &this->hist_maker_param_); - if (hist_maker_param_.single_precision_histogram) { - float_maker_.reset(new GPUHistMakerSpecialised(task_)); - FromJson(config.at("train_param"), &float_maker_->param_); - } else { - double_maker_.reset(new GPUHistMakerSpecialised(task_)); - FromJson(config.at("train_param"), &double_maker_->param_); - } - } - void SaveConfig(Json* p_out) const override { - auto& out = *p_out; - out["gpu_hist_train_param"] = ToJson(hist_maker_param_); - if (hist_maker_param_.single_precision_histogram) { - out["train_param"] = ToJson(float_maker_->param_); - } else { - out["train_param"] = ToJson(double_maker_->param_); - } - } - - void Update(HostDeviceVector* gpair, DMatrix* dmat, - const std::vector& trees) override { - if (hist_maker_param_.single_precision_histogram) { - float_maker_->Update(gpair, dmat, trees); - } else { - double_maker_->Update(gpair, dmat, trees); - } - } - - bool - UpdatePredictionCache(const DMatrix *data, - linalg::VectorView p_out_preds) override { - if (hist_maker_param_.single_precision_histogram) { - return float_maker_->UpdatePredictionCache(data, p_out_preds); - } else { - return double_maker_->UpdatePredictionCache(data, p_out_preds); - } - } - - char const* Name() const override { - return "grow_gpu_hist"; - } - - private: - GPUHistMakerTrainParam hist_maker_param_; - ObjInfo task_; - std::unique_ptr> float_maker_; - std::unique_ptr> double_maker_; -}; - #if !defined(GTEST_TEST) XGBOOST_REGISTER_TREE_UPDATER(GPUHistMaker, "grow_gpu_hist") .describe("Grow tree with GPU.") - .set_body([](ObjInfo task) { return new GPUHistMaker(task); }); + .set_body([](GenericParameter const* tparam, ObjInfo task) { + return new GPUHistMaker(tparam, task); + }); #endif // !defined(GTEST_TEST) } // namespace tree diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index 0a85d2d73832..9d36e4d16c0a 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -24,9 +24,9 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker); class HistMaker: public BaseMaker { public: - void Update(HostDeviceVector *gpair, - DMatrix *p_fmat, - const std::vector &trees) override { + explicit HistMaker(GenericParameter const *ctx) : BaseMaker(ctx) {} + void Update(HostDeviceVector *gpair, DMatrix *p_fmat, + const std::vector &trees) override { interaction_constraints_.Configure(param_, p_fmat->Info().num_col_); // rescale learning rate according to size of trees float lr = param_.learning_rate; @@ -262,12 +262,10 @@ class HistMaker: public BaseMaker { } }; -class CQHistMaker: public HistMaker { +class CQHistMaker : public HistMaker { public: - CQHistMaker() = default; - char const* Name() const override { - return "grow_local_histmaker"; - } + explicit CQHistMaker(GenericParameter const *ctx) : HistMaker(ctx) {} + char const *Name() const override { return "grow_local_histmaker"; } protected: struct HistEntry { @@ -624,9 +622,7 @@ class CQHistMaker: public 
HistMaker { }; XGBOOST_REGISTER_TREE_UPDATER(LocalHistMaker, "grow_local_histmaker") -.describe("Tree constructor that uses approximate histogram construction.") -.set_body([](ObjInfo) { - return new CQHistMaker(); - }); + .describe("Tree constructor that uses approximate histogram construction.") + .set_body([](GenericParameter const *ctx, ObjInfo) { return new CQHistMaker(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index f71f1c698cb9..9e6fad883040 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -21,9 +21,9 @@ namespace tree { DMLC_REGISTRY_FILE_TAG(updater_prune); /*! \brief pruner that prunes a tree after growing finishes */ -class TreePruner: public TreeUpdater { +class TreePruner : public TreeUpdater { public: - explicit TreePruner(ObjInfo task) { + explicit TreePruner(GenericParameter const* ctx, ObjInfo task) : TreeUpdater(ctx) { syncher_.reset(TreeUpdater::Create("sync", ctx_, task)); pruner_monitor_.Init("TreePruner"); } @@ -112,9 +112,7 @@ class TreePruner: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreePruner, "prune") -.describe("Pruner that prune the tree according to statistics.") -.set_body([](ObjInfo task) { - return new TreePruner(task); - }); + .describe("Pruner that prune the tree according to statistics.") + .set_body([](GenericParameter const* ctx, ObjInfo task) { return new TreePruner(ctx, task); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 0e1b6db47691..dcbb3dbfba3e 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -390,6 +390,8 @@ template struct QuantileHistMaker::Builder; XGBOOST_REGISTER_TREE_UPDATER(QuantileHistMaker, "grow_quantile_histmaker") .describe("Grow tree using quantized histogram.") - .set_body([](ObjInfo task) { return new QuantileHistMaker(task); }); + .set_body([](GenericParameter const *ctx, ObjInfo task) { + return new QuantileHistMaker(ctx, task); + }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 3c03a371ebfb..463c7a54ab39 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -225,7 +225,8 @@ inline BatchParam HistBatch(TrainParam const& param) { /*! \brief construct a tree using quantized feature values */ class QuantileHistMaker: public TreeUpdater { public: - explicit QuantileHistMaker(ObjInfo task) : task_{task} {} + explicit QuantileHistMaker(GenericParameter const* ctx, ObjInfo task) + : task_{task}, TreeUpdater(ctx) {} void Configure(const Args& args) override; void Update(HostDeviceVector* gpair, diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index d17c1e1444f7..6110e964f891 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -22,11 +22,10 @@ namespace tree { DMLC_REGISTRY_FILE_TAG(updater_refresh); /*! 
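One C++ detail about the initializer lists introduced in these constructors (for example ": task_{task}, TreeUpdater(ctx)" in QuantileHistMaker below): the base class is always constructed before non-static members regardless of the order written in the list, so the behaviour is the same either way, although compilers may issue a -Wreorder warning. A tiny standalone illustration:

    #include <iostream>

    struct Base {
      explicit Base(int v) { std::cout << "Base(" << v << ")\n"; }
    };

    struct Derived : Base {
      // Written member-first, but Base still runs first: initialisation order is
      // bases, then members in declaration order, not initializer-list order.
      explicit Derived(int v) : member_{v}, Base(v) { std::cout << "Derived body\n"; }
      int member_;
    };

    int main() {
      Derived d{42};  // prints "Base(42)", then "Derived body"
      return 0;
    }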
\brief pruner that prunes a tree after growing finishs */ -class TreeRefresher: public TreeUpdater { +class TreeRefresher : public TreeUpdater { public: - void Configure(const Args& args) override { - param_.UpdateAllowUnknown(args); - } + explicit TreeRefresher(GenericParameter const *ctx) : TreeUpdater(ctx) {} + void Configure(const Args &args) override { param_.UpdateAllowUnknown(args); } void LoadConfig(Json const& in) override { auto const& config = get(in); FromJson(config.at("train_param"), &this->param_); @@ -160,9 +159,7 @@ class TreeRefresher: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreeRefresher, "refresh") -.describe("Refresher that refreshes the weight and statistics according to data.") -.set_body([](ObjInfo) { - return new TreeRefresher(); - }); + .describe("Refresher that refreshes the weight and statistics according to data.") + .set_body([](GenericParameter const *ctx, ObjInfo) { return new TreeRefresher(ctx); }); } // namespace tree } // namespace xgboost diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 4f7c7a1a85a6..5a22675965dc 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -20,8 +20,9 @@ DMLC_REGISTRY_FILE_TAG(updater_sync); * \brief syncher that synchronize the tree in all distributed nodes * can implement various strategies, so far it is always set to node 0's tree */ -class TreeSyncher: public TreeUpdater { +class TreeSyncher : public TreeUpdater { public: + explicit TreeSyncher(GenericParameter const* tparam) : TreeUpdater(tparam) {} void Configure(const Args&) override {} void LoadConfig(Json const&) override {} @@ -52,9 +53,7 @@ class TreeSyncher: public TreeUpdater { }; XGBOOST_REGISTER_TREE_UPDATER(TreeSyncher, "sync") -.describe("Syncher that synchronize the tree in all distributed nodes.") -.set_body([](ObjInfo) { - return new TreeSyncher(); - }); + .describe("Syncher that synchronize the tree in all distributed nodes.") + .set_body([](GenericParameter const* tparam, ObjInfo) { return new TreeSyncher(tparam); }); } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 82f40465deb2..883537863307 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -275,8 +275,10 @@ void TestHistogramIndexImpl() { int constexpr kNRows = 1000, kNCols = 10; // Build 2 matrices and build a histogram maker with that - tree::GPUHistMakerSpecialised hist_maker{ObjInfo{ObjInfo::kRegression}}, - hist_maker_ext{ObjInfo{ObjInfo::kRegression}}; + + GenericParameter generic_param(CreateEmptyGenericParam(0)); + tree::GPUHistMaker hist_maker{&generic_param,ObjInfo{ObjInfo::kRegression}}, + hist_maker_ext{&generic_param,ObjInfo{ObjInfo::kRegression}}; std::unique_ptr hist_maker_dmat( CreateSparsePageDMatrixWithRC(kNRows, kNCols, 0, true)); @@ -289,10 +291,9 @@ void TestHistogramIndexImpl() { {"max_leaves", "0"} }; - GenericParameter generic_param(CreateEmptyGenericParam(0)); - hist_maker.Configure(training_params, &generic_param); + hist_maker.Configure(training_params); hist_maker.InitDataOnce(hist_maker_dmat.get()); - hist_maker_ext.Configure(training_params, &generic_param); + hist_maker_ext.Configure(training_params); hist_maker_ext.InitDataOnce(hist_maker_ext_dmat.get()); // Extract the device maker from the histogram makers and from that its compressed @@ -344,10 +345,9 @@ void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, {"sampling_method", sampling_method}, }; - tree::GPUHistMakerSpecialised 
hist_maker{ObjInfo{ObjInfo::kRegression}}; GenericParameter generic_param(CreateEmptyGenericParam(0)); - hist_maker.Configure(args, &generic_param); - + tree::GPUHistMaker hist_maker{&generic_param,ObjInfo{ObjInfo::kRegression}}; + hist_maker.Configure(args); hist_maker.Update(gpair, dmat, {tree}); auto cache = linalg::VectorView{preds->DeviceSpan(), {preds->Size()}, 0}; hist_maker.UpdatePredictionCache(dmat, cache); diff --git a/tests/python-gpu/test_gpu_basic_models.py b/tests/python-gpu/test_gpu_basic_models.py index 06e63bdd56d9..9e955eac2931 100644 --- a/tests/python-gpu/test_gpu_basic_models.py +++ b/tests/python-gpu/test_gpu_basic_models.py @@ -16,11 +16,11 @@ class TestGPUBasicModels: cpu_test_bm = test_bm.TestModels() def run_cls(self, X, y): - cls = xgb.XGBClassifier(tree_method='gpu_hist', single_precision_histogram=True) + cls = xgb.XGBClassifier(tree_method='gpu_hist') cls.fit(X, y) cls.get_booster().save_model('test_deterministic_gpu_hist-0.json') - cls = xgb.XGBClassifier(tree_method='gpu_hist', single_precision_histogram=True) + cls = xgb.XGBClassifier(tree_method='gpu_hist') cls.fit(X, y) cls.get_booster().save_model('test_deterministic_gpu_hist-1.json') diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index a3427b566360..8f3cbcaac61f 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note +from hypothesis import given, strategies, assume, settings, note, reproduce_failure sys.path.append("tests/python") import testing as tm @@ -15,7 +15,6 @@ 'max_leaves': strategies.integers(0, 256), 'max_bin': strategies.integers(2, 1024), 'grow_policy': strategies.sampled_from(['lossguide', 'depthwise']), - 'single_precision_histogram': strategies.booleans(), 'min_child_weight': strategies.floats(0.5, 2.0), 'seed': strategies.integers(0, 10), # We cannot enable subsampling as the training loss can increase From f140ebcb2f0219486ad8702eaa322e0f9da624ea Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 25 Apr 2022 04:27:15 -0700 Subject: [PATCH 02/64] Batch nodes from driver --- src/tree/driver.h | 33 +++++++++--- src/tree/updater_approx.cc | 2 +- src/tree/updater_gpu_hist.cu | 69 +++++++++++++------------- src/tree/updater_quantile_hist.cc | 2 +- tests/cpp/tree/gpu_hist/test_driver.cu | 33 +++++++----- 5 files changed, 84 insertions(+), 55 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index abb8afadcb8a..1e40cc32622f 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -33,9 +33,9 @@ class Driver { std::function>; public: - explicit Driver(TrainParam::TreeGrowPolicy policy) - : policy_(policy), - queue_(policy == TrainParam::kDepthWise ? DepthWise : + explicit Driver(TrainParam param) + : param_(param), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise : LossGuide) {} template void Push(EntryIterT begin, EntryIterT end) { @@ -55,16 +55,30 @@ class Driver { return queue_.empty(); } + // Can a child of this entry still be expanded? 
+ // can be used to avoid extra work + bool IsChildValid(ExpandEntryT const& parent_entry){ + if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; + if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; + return true; + } + // Return the set of nodes to be expanded // This set has no dependencies between entries so they may be expanded in // parallel or asynchronously std::vector Pop() { if (queue_.empty()) return {}; // Return a single entry for loss guided mode - if (policy_ == TrainParam::kLossGuide) { + if (param_.grow_policy == TrainParam::kLossGuide) { ExpandEntryT e = queue_.top(); queue_.pop(); - return {e}; + + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + return {e}; + } else { + return {}; + } } // Return nodes on same level for depth wise std::vector result; @@ -72,7 +86,11 @@ class Driver { int level = e.depth; while (e.depth == level && !queue_.empty()) { queue_.pop(); - result.emplace_back(e); + if (e.IsValid(param_, num_leaves_)) { + num_leaves_++; + result.emplace_back(e); + } + if (!queue_.empty()) { e = queue_.top(); } @@ -81,7 +99,8 @@ class Driver { } private: - TrainParam::TreeGrowPolicy policy_; + TrainParam param_; + std::size_t num_leaves_=1; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index a06f195374b6..1c6b195ab34b 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -169,7 +169,7 @@ class GloablApproxBuilder { p_last_tree_ = p_tree; this->InitData(p_fmat, hess); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); auto &tree = *p_tree; driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)}); bst_node_t num_leaves{1}; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2cac6b6c4f4a..2340687983a8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -531,7 +531,7 @@ struct GPUHistMakerDevice { RegTree& tree = *p_tree; // Sanity check - have we created a leaf with no training instances? - if (!rabit::IsDistributed()) { + if (!rabit::IsDistributed() && row_partitioner) { CHECK(row_partitioner->GetRows(candidate.nid).size() > 0) << "No training instances in this leaf!"; } @@ -616,7 +616,7 @@ struct GPUHistMakerDevice { void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo task, RegTree* p_tree, dh::AllReducer* reducer) { auto& tree = *p_tree; - Driver driver(static_cast(param.grow_policy)); + Driver driver(param); monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_, task); @@ -626,48 +626,49 @@ struct GPUHistMakerDevice { driver.Push({ this->InitRoot(p_tree, task, reducer) }); monitor.Stop("InitRoot"); - auto num_leaves = 1; - // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { + for(auto & candidate: expand_set){ + this->ApplySplit(candidate, p_tree); + } + // Get the candidates we are allowed to expand further + // e.g. 
We do not bother further processing nodes whose children are beyond max depth + std::vector filtered_expand_set; + std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), + [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = - pinned.GetSpan(expand_set.size() * 2, GPUExpandEntry()); + pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); + + for(const auto &e:filtered_expand_set){ + monitor.Start("UpdatePosition"); + // Update position is only run when child is valid, instead of right after apply + // split (as in approx tree method). Hense we have the finalise position call + // in GPU Hist. + this->UpdatePosition(e.nid, p_tree); + monitor.Stop("UpdatePosition"); + } - for (auto i = 0ull; i < expand_set.size(); i++) { + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { auto candidate = expand_set.at(i); - if (!candidate.IsValid(param, num_leaves)) { - continue; - } - this->ApplySplit(candidate, p_tree); + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); - num_leaves++; + monitor.Start("BuildHist"); + this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); + monitor.Stop("BuildHist"); + } + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { + auto candidate = expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed - if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), - num_leaves)) { - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. 
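Taken together, the Driver changes above move leaf accounting and validity filtering into the driver itself: Pop() returns only entries that are still allowed to become splits, and IsChildValid() lets the caller skip histogram work for children that could never be expanded, which is what the rewritten UpdateTree loop does with filtered_expand_set. A simplified host-only sketch of how a builder drives that interface, with stand-in Entry/Param types and a per-level FIFO instead of the real priority queue:

    #include <cstdio>
    #include <deque>
    #include <vector>

    struct Entry { int nid{0}; int depth{0}; float loss_chg{0.f}; };
    struct Param { int max_depth{2}; int max_leaves{0}; };

    class Driver {
     public:
      explicit Driver(Param p) : p_(p) {}
      void Push(std::vector<Entry> const& es) {
        for (auto const& e : es) q_.push_back(e);
      }
      // Can a child of this entry still be expanded? Lets the caller skip
      // histogram work for children that can never become splits.
      bool IsChildValid(Entry const& parent) const {
        if (p_.max_depth > 0 && parent.depth + 1 >= p_.max_depth) return false;
        if (p_.max_leaves > 0 && n_leaves_ >= p_.max_leaves) return false;
        return true;
      }
      // Pop every candidate on the current level, counting leaves and dropping
      // entries that bring no gain.
      std::vector<Entry> Pop() {
        std::vector<Entry> out;
        if (q_.empty()) return out;
        int level = q_.front().depth;
        while (!q_.empty() && q_.front().depth == level) {
          Entry e = q_.front();
          q_.pop_front();
          if (e.loss_chg > 0.f) {  // stand-in for ExpandEntry::IsValid()
            ++n_leaves_;
            out.push_back(e);
          }
        }
        return out;
      }

     private:
      Param p_;
      int n_leaves_{1};
      std::deque<Entry> q_;
    };

    int main() {
      Driver driver{Param{}};
      driver.Push({Entry{0, 0, 1.f}});  // root candidate
      while (true) {
        auto batch = driver.Pop();      // a whole level at once
        if (batch.empty()) break;
        std::vector<Entry> next;
        for (auto const& e : batch) {
          std::printf("apply split at node %d (depth %d)\n", e.nid, e.depth);
          if (!driver.IsChildValid(e)) continue;  // no histograms for dead ends
          next.push_back(Entry{2 * e.nid + 1, e.depth + 1, 1.f});
          next.push_back(Entry{2 * e.nid + 2, e.depth + 1, 1.f});
        }
        driver.Push(next);
      }
      return 0;
    }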
- this->UpdatePosition(candidate.nid, p_tree); - monitor.Stop("UpdatePosition"); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - - monitor.Start("EvaluateSplits"); - this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree, - new_candidates.subspan(i * 2, 2)); - monitor.Stop("EvaluateSplits"); - } else { - // Set default - new_candidates[i * 2] = GPUExpandEntry(); - new_candidates[i * 2 + 1] = GPUExpandEntry(); - } + + monitor.Start("EvaluateSplits"); + this->EvaluateLeftRightSplits(candidate, task, left_child_nidx, right_child_nidx, *p_tree, + new_candidates.subspan(i * 2, 2)); + monitor.Stop("EvaluateSplits"); } dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index dcbb3dbfba3e..bdda543d75a7 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -174,7 +174,7 @@ void QuantileHistMaker::Builder::ExpandTree( DMatrix *p_fmat, RegTree *p_tree, const std::vector &gpair_h) { monitor_->Start(__func__); - Driver driver(static_cast(param_.grow_policy)); + Driver driver(param_); driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu index d35f3510f628..d7f8cc63869e 100644 --- a/tests/cpp/tree/gpu_hist/test_driver.cu +++ b/tests/cpp/tree/gpu_hist/test_driver.cu @@ -6,16 +6,21 @@ namespace xgboost { namespace tree { TEST(GpuHist, DriverDepthWise) { - Driver driver(TrainParam::kDepthWise); + TrainParam p; + p.InitAllowUnknown(Args{}); + p.grow_policy=TrainParam::kDepthWise; + Driver driver(p); EXPECT_TRUE(driver.Pop().empty()); DeviceSplitCandidate split; split.loss_chg = 1.0f; - GPUExpandEntry root(0, 0, split, .0f, .0f, .0f); + split.left_sum = {0.0f, 1.0f}; + split.right_sum = {0.0f, 1.0f}; + GPUExpandEntry root(0, 0, split, 2.0f, 1.0f, 1.0f); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); - driver.Push({GPUExpandEntry{1, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 1, split, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{3, 2, split, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{3, 2, split, 2.0f, 1.0f, 1.0f}}); // Should return entries from level 1 auto res = driver.Pop(); EXPECT_EQ(res.size(), 2); @@ -29,18 +34,22 @@ TEST(GpuHist, DriverDepthWise) { TEST(GpuHist, DriverLossGuided) { DeviceSplitCandidate high_gain; + high_gain.left_sum = {0.0f, 1.0f}; + high_gain.right_sum = {0.0f, 1.0f}; high_gain.loss_chg = 5.0f; - DeviceSplitCandidate low_gain; + DeviceSplitCandidate low_gain = high_gain; low_gain.loss_chg = 1.0f; - Driver driver(TrainParam::kLossGuide); + TrainParam p; + p.grow_policy=TrainParam::kLossGuide; + Driver driver(p); EXPECT_TRUE(driver.Pop().empty()); - GPUExpandEntry root(0, 0, high_gain, .0f, .0f, .0f); + GPUExpandEntry root(0, 0, high_gain, 2.0f, 1.0f, 1.0f ); driver.Push({root}); EXPECT_EQ(driver.Pop().front().nid, 0); // Select high gain first - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{2, 2, high_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{2, 2, high_gain, 2.0f, 
1.0f, 1.0f}}); auto res = driver.Pop(); EXPECT_EQ(res.size(), 1); EXPECT_EQ(res[0].nid, 2); @@ -49,8 +58,8 @@ TEST(GpuHist, DriverLossGuided) { EXPECT_EQ(res[0].nid, 1); // If equal gain, use nid - driver.Push({GPUExpandEntry{2, 1, low_gain, .0f, .0f, .0f}}); - driver.Push({GPUExpandEntry{1, 1, low_gain, .0f, .0f, .0f}}); + driver.Push({GPUExpandEntry{2, 1, low_gain, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{1, 1, low_gain, 2.0f, 1.0f, 1.0f}}); res = driver.Pop(); EXPECT_EQ(res[0].nid, 1); res = driver.Pop(); From 80a3e78f9e1dcbf2a78f6572897453c61afd60b0 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 29 Apr 2022 02:22:26 -0700 Subject: [PATCH 03/64] Categoricals broken --- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/evaluator.cu | 73 +++---- src/tree/gpu_hist/histogram.cu | 9 - src/tree/updater_gpu_hist.cu | 223 +++++++++++++--------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 - tests/cpp/tree/test_gpu_hist.cu | 45 +++-- 6 files changed, 186 insertions(+), 167 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index b03fd7b41b51..7d792051e5be 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -92,7 +92,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs . + * \brief Get sorted index storage based on the left node of inputs. */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index bc2027489131..381ef8fbb349 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -21,55 +21,36 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical() && !task.UseOneHot()) { + if (cuts.HasCategorical()) { dh::XGBCachingDeviceAllocator alloc; - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - auto beg = thrust::make_counting_iterator(1ul); - auto end = thrust::make_counting_iterator(ptrs.size()); - auto to_onehot = param.max_cat_to_onehot; - // This condition avoids sort-based split function calls if the users want - // onehot-encoding-based splits. - // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. - has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { - auto idx = i - 1; - if (common::IsCat(ft, idx)) { - auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); - return use_sort; - } - return false; - }); - - if (has_sort_) { - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
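After this change the evaluator allocates category bit storage whenever the cuts contain any categorical feature, sized by the largest category value and the maximum number of tree nodes. The arithmetic is simply one bit per category packed into 32-bit words, replicated per node; a small sketch of that sizing, where the MaxNodes() formula for a depth-limited tree and the concrete numbers are assumptions for illustration:

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // One bit per category, packed into 32-bit words (simplified analogue of
    // common::CatBitField::ComputeStorageSize).
    std::size_t ComputeStorageWords(std::size_t n_cats) {
      return (n_cats + 31) / 32;
    }

    int main() {
      std::size_t max_category = 69;  // largest encoded category seen in the cuts
      int max_depth = 6;
      std::size_t max_nodes = (1u << (max_depth + 1)) - 1;  // full binary tree of that depth

      std::size_t words_per_node = ComputeStorageWords(max_category + 1);
      std::vector<std::uint32_t> split_cats(max_nodes * words_per_node, 0u);

      std::printf("%zu categories -> %zu words per node, %zu words for %zu nodes\n",
                  max_category + 1, words_per_node, split_cats.size(), max_nodes);
      return 0;
    }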
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. + sort_input_.resize(cat_sorted_idx_.size()); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), - feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); - } + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), + [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); } } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 791363a05cdd..efb08d5e44e2 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } -template void BuildGradientHistogram( - EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, - HistRounding rounding, - bool force_global_memory); - template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2340687983a8..2cd9d4babeb1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -57,7 +57,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogram + * \struct DeviceHistogramStorage * * \summary Data storage for node histograms on device. Automatically expands. * @@ -67,12 +67,18 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogram { +template +class DeviceHistogramStorage { private: /*! \brief Map nidx to starting index of its histogram. 
*/ std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -81,6 +87,8 @@ class DeviceHistogram { "Number of items in gradient type should be 2."); public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -91,21 +99,53 @@ class DeviceHistogram { dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); + overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { - return n_bins_ * kNumItemsInGradientSum; - } + size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } - dh::device_vector& Data() { - return data_; + void AllocateHistograms(const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } + // Number of items currently used in data + const size_t used_size = nidx_map_.size() * HistogramSize(); + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); + } + } else { + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); + } + } + + CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } + /* void AllocateHistogram(int nidx) { if (HistogramExists(nidx)) return; // Number of items currently used in data @@ -139,6 +179,7 @@ class DeviceHistogram { CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } + */ /** * \summary Return pointer to histogram memory for a given node. 
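The reworked DeviceHistogramStorage keeps a large cached buffer and, once the cache limit is reached, serves each new batch from a small overflow buffer that is simply overwritten by the next request, so cached nodes persist while overflow nodes are transient. A host-side miniature of the same policy, with std::vector/std::map standing in for the device containers and purely illustrative sizes:

    #include <cassert>
    #include <cstddef>
    #include <map>
    #include <vector>

    class HistogramStorage {
     public:
      HistogramStorage(std::size_t bins, std::size_t max_cached_nodes)
          : bins_(bins), stop_growing_(max_cached_nodes * bins) {}

      void AllocateHistograms(std::vector<int> const& new_nidxs) {
        std::size_t used = cached_.size() * bins_;
        if (used >= stop_growing_) {
          // Cache is full: recycle the overflow buffer for this batch only.
          overflow_map_.clear();
          overflow_.assign(new_nidxs.size() * bins_, 0.0);
          for (int nidx : new_nidxs) {
            std::size_t offset = overflow_map_.size() * bins_;
            overflow_map_[nidx] = offset;
          }
        } else {
          // Grow the cache and append the new nodes behind the existing ones.
          data_.resize(used + new_nidxs.size() * bins_, 0.0);
          for (int nidx : new_nidxs) {
            std::size_t offset = cached_.size() * bins_;
            cached_[nidx] = offset;
          }
        }
      }
      double* GetNodeHistogram(int nidx) {
        auto it = cached_.find(nidx);
        if (it != cached_.end()) return data_.data() + it->second;
        return overflow_.data() + overflow_map_.at(nidx);
      }
      bool Exists(int nidx) const {
        return cached_.count(nidx) > 0 || overflow_map_.count(nidx) > 0;
      }

     private:
      std::size_t bins_, stop_growing_;
      std::map<int, std::size_t> cached_, overflow_map_;
      std::vector<double> data_, overflow_;
    };

    int main() {
      HistogramStorage hist(/*bins=*/4, /*max_cached_nodes=*/2);
      hist.AllocateHistograms({1, 2});   // fits in the cache
      hist.GetNodeHistogram(1)[0] = 5.0;
      hist.AllocateHistograms({3, 4});   // cache limit reached -> overflow
      hist.AllocateHistograms({5, 6});   // overwrites the overflow slots of 3 and 4
      assert(hist.Exists(1) && hist.Exists(2));    // cached nodes survive
      assert(!hist.Exists(3) && hist.Exists(5));   // overflow nodes are transient
      assert(hist.GetNodeHistogram(1)[0] == 5.0);  // cached data is preserved
      return 0;
    }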
@@ -147,9 +188,16 @@ class DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span( - reinterpret_cast(ptr), n_bins_); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } } }; @@ -166,7 +214,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogram hist{}; + DeviceHistogramStorage hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -189,8 +237,6 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; - // Storing split categories for last node. - dh::caching_device_vector node_categories; GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -319,7 +365,6 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { - hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(device_id), @@ -327,8 +372,12 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - void SubtractionTrick(int nidx_parent, int nidx_histogram, - int nidx_subtraction) { + // Attempt to the subtraction trick + // return true if succeeded + bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { + return false; + } auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -337,22 +386,18 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); + return true; } - bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - // Make sure histograms are already allocated - hist.AllocateHistogram(nidx_subtraction); - return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); - } - - void UpdatePosition(int nidx, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[nidx]; - auto split_type = p_tree->NodeSplitType(nidx); + void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); auto d_matrix = page->GetDeviceAccessor(device_id); - auto node_cats = dh::ToSpan(node_categories); + auto node_cats = e.split.split_cats.Bits(); + row_partitioner->UpdatePosition( - nidx, split_node.LeftChild(), split_node.RightChild(), + e.nid, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -483,13 +528,15 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - void AllReduceHist(int nidx, dh::AllReducer* reducer) { + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = 
hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum( - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->AllReduceSum(reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * + num_histograms); monitor.Stop("AllReduce"); } @@ -497,33 +544,49 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, - int nidx_right, dh::AllReducer* reducer) { - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); - if (fewer_right) { - std::swap(build_hist_nidx, subtraction_trick_nidx); + void BuildHistLeftRight(std::vectorconst &candidates, dh::AllReducer* reducer, const RegTree& tree) { + if(candidates.empty()) return; + // Some nodes we will manually compute histograms + // others we will do by subtraction + std::vector hist_nidx; + std::vector subtraction_nidx; + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); + if (fewer_right) { + hist_nidx.emplace_back(tree[e.nid].RightChild()); + subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); + } else { + hist_nidx.emplace_back(tree[e.nid].LeftChild()); + subtraction_nidx.emplace_back(tree[e.nid].RightChild()); + } + } + std::vector all_new = hist_nidx; + all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); + // Allocate the histograms + // Guaranteed contiguous memory + hist.AllocateHistograms(all_new); + + for(auto nidx:hist_nidx){ + this->BuildHist(nidx); } - this->BuildHist(build_hist_nidx); - this->AllReduceHist(build_hist_nidx, reducer); + // Reduce all in one go + // This gives much better latency in a distributed setting + // when processing a large batch + this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = this->CanDoSubtractionTrick( - candidate.nid, build_hist_nidx, subtraction_trick_nidx); + for (int i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = hist_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - this->SubtractionTrick(candidate.nid, build_hist_nidx, - subtraction_trick_nidx); - } else { - // Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer); + if(!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)){ + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer, 1); + } } } @@ -546,27 +609,11 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value 
too large."; std::vector split_cats; - if (candidate.split.split_cats.Bits().empty()) { - if (common::InvalidCat(candidate.split.fvalue)) { - common::InvalidCategory(); - } - auto cat = common::AsCat(candidate.split.fvalue); - split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); - common::CatBitField cats_bits(split_cats); - cats_bits.Set(cat); - dh::CopyToD(split_cats, &node_categories); - } else { - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - - node_categories.resize(candidate.split.split_cats.Bits().size()); - dh::safe_cuda(cudaMemcpyAsync( - node_categories.data().get(), candidate.split.split_cats.Data(), - candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); - } + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, @@ -598,8 +645,9 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); + hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer); + this->AllReduceHist(kRootNIdx, reducer, 1); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -638,6 +686,7 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -646,22 +695,16 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
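Because each level's child histograms are now allocated as one contiguous block, the per-node reductions above collapse into a single AllReduceSum over num_histograms * bins gradient pairs, and the sibling that was not built directly (the one with the larger hessian sum) is recovered with the subtraction trick, sibling = parent - built_child. A host-side sketch of both ideas, with the all-reduce replaced by an element-wise sum across two simulated workers:

    #include <cassert>
    #include <cstddef>
    #include <vector>

    struct GradPair { double grad{0.0}, hess{0.0}; };

    // Stand-in for the single AllReduceSum over a contiguous block of histograms:
    // every worker contributes its block and receives the element-wise sum.
    void AllReduceSum(std::vector<GradPair>* local, std::vector<GradPair> const& other_worker) {
      for (std::size_t i = 0; i < local->size(); ++i) {
        (*local)[i].grad += other_worker[i].grad;
        (*local)[i].hess += other_worker[i].hess;
      }
    }

    // Subtraction trick: the sibling histogram is the parent minus the built child.
    std::vector<GradPair> Subtract(std::vector<GradPair> const& parent,
                                   std::vector<GradPair> const& built_child) {
      std::vector<GradPair> out(parent.size());
      for (std::size_t i = 0; i < parent.size(); ++i) {
        out[i].grad = parent[i].grad - built_child[i].grad;
        out[i].hess = parent[i].hess - built_child[i].hess;
      }
      return out;
    }

    int main() {
      const std::size_t bins = 3, num_histograms = 2;
      // Two nodes' histograms laid out back to back, one block per worker.
      std::vector<GradPair> worker0(bins * num_histograms, GradPair{1.0, 1.0});
      std::vector<GradPair> worker1(bins * num_histograms, GradPair{2.0, 2.0});
      AllReduceSum(&worker0, worker1);  // one call covers both nodes
      assert(worker0[0].grad == 3.0 && worker0[bins].grad == 3.0);

      // Parent histogram equals the sum of its two children.
      std::vector<GradPair> parent(bins, GradPair{3.0, 3.0});
      std::vector<GradPair> left(bins, GradPair{1.0, 1.0});
      auto right = Subtract(parent, left);
      assert(right[0].grad == 2.0 && right[0].hess == 2.0);
      return 0;
    }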
- this->UpdatePosition(e.nid, p_tree); + this->UpdatePosition(e, p_tree); monitor.Stop("UpdatePosition"); } - for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - } + monitor.Start("BuildHist"); + this->BuildHistLeftRight(filtered_expand_set, reducer, tree); + monitor.Stop("BuildHist"); for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); + auto candidate = filtered_expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b543a48d7cc..75d97b681a61 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 883537863307..bdabbbcb38c2 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,29 +29,38 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogram histogram; + DeviceHistogramStorage histogram; histogram.Init(0, kNBins); - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } - for (size_t i = 0; i < kNNodes; ++i) { + for (int i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Erase existing nidx_map. 
- for (size_t i = kNNodes; i < kNNodes * 2; ++i) { - histogram.AllocateHistogram(i); - } - for (size_t i = 0; i < kNNodes; ++i) { - ASSERT_FALSE(histogram.HistogramExists(i)); + // Add two new nodes + histogram.AllocateHistograms({kNNodes}); + histogram.AllocateHistograms({kNNodes+1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists({kNNodes})); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -95,9 +104,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; + maker.histogram_rounding = CreateRoundingFactor(maker.gpair); BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -105,7 +114,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogram& d_hist = maker.hist; + DeviceHistogramStorage& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -128,12 +137,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); - TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); - TestBuildHist(true); } TEST(GpuHist, ApplySplit) { @@ -173,8 +180,6 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); - - ASSERT_EQ(updater.node_categories.size(), 1); } } @@ -238,7 +243,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); From e1fb7024fdea6224349a3fbd863ba839b3a78748 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 1 May 2022 09:50:09 -0700 Subject: [PATCH 04/64] Refactor categoricals --- src/tree/gpu_hist/evaluate_splits.cu | 39 ++++++++------- src/tree/gpu_hist/evaluate_splits.cuh | 6 +-- src/tree/gpu_hist/evaluator.cu | 72 +++++++++++++-------------- src/tree/updater_gpu_hist.cu | 43 +++++----------- tests/cpp/tree/test_gpu_hist.cu | 2 - 5 files changed, 72 insertions(+), 90 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index ce8b13d0def2..7fba1902b881 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -273,12 +273,19 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, * \brief Set the bits for categorical splits based on the split threshold. 
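The partition-based categorical splits handled below rely on a sorted index over the categorical bins; the usual approach, sketched here, is to order categories by their gradient/hessian ratio and then scan prefixes of that order exactly like a numerical feature. The numbers and the exact sort key are illustrative assumptions rather than the precise library logic:

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<double> grad = {-2.0, 1.5, 0.5, -0.5};  // per-category gradient sums
      std::vector<double> hess = { 1.0, 1.0, 1.0,  1.0};  // per-category hessian sums

      std::vector<int> order(grad.size());
      std::iota(order.begin(), order.end(), 0);
      std::sort(order.begin(), order.end(),
                [&](int a, int b) { return grad[a] / hess[a] < grad[b] / hess[b]; });

      // Every prefix of `order` is a candidate "left" partition; the best prefix
      // is then chosen by the usual gain computation (omitted here).
      for (int cat : order) std::printf("%d ", cat);
      std::printf("\n");  // prints "0 3 2 1" for the numbers above
      return 0;
    }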
*/ template -__device__ void SortBasedSplit(EvaluateSplitInputs const &input, +__device__ void SetCategoricalSplit(EvaluateSplitInputs const &input, common::Span d_sorted_idx, bst_feature_t fidx, bool is_left, common::Span out, - DeviceSplitCandidate *p_out_split) { + DeviceSplitCandidate *p_out_split, ObjInfo task) { auto &out_split = *p_out_split; out_split.split_cats = common::CatBitField{out}; + + // Simple case for one hot split + if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + out_split.split_cats.Set(common::AsCat(out_split.fvalue)); + return; + } + auto node_sorted_idx = is_left ? d_sorted_idx.subspan(0, input.feature_values.size()) : d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size()); @@ -313,7 +320,7 @@ void GPUHistEvaluator::EvaluateSplits( EvaluateSplitInputs left, EvaluateSplitInputs right, ObjInfo task, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { - if (!split_cats_.empty()) { + if (need_sort_histogram_) { this->SortHistogram(left, right, evaluator); } @@ -354,14 +361,12 @@ void GPUHistEvaluator::EvaluateSplits( template void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out) { - if (has_sort_) { - dh::CUDAEvent event; - event.Record(dh::DefaultStream()); - auto h_cats = this->HostCatStorage(input.nidx); - copy_stream_.View().Wait(event); - dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(), - cudaMemcpyDeviceToHost, copy_stream_.View())); - } + dh::CUDAEvent event; + event.Record(dh::DefaultStream()); + auto h_cats = this->HostCatStorage(input.nidx); + copy_stream_.View().Wait(event); + dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(), + cudaMemcpyDeviceToHost, copy_stream_.View())); } template @@ -378,17 +383,16 @@ void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, Ob auto d_sorted_idx = this->SortedIdx(left); auto d_entries = out_entries; auto cats_out = this->DeviceCatStorage(left.nidx); - // turn candidate into entry, along with hanlding sort based split. + // turn candidate into entry, along with handling sort based split. dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) { auto const &input = i == 0 ? left : right; auto &split = out_splits[i]; auto fidx = out_splits[i].findex; - if (split.is_cat && - !common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + if (split.is_cat) { bool is_left = i == 0; auto out = is_left ? 
cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2); - SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]); + SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i], task); } float base_weight = @@ -420,9 +424,8 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( auto &split = out_split[i]; auto fidx = out_split[i].findex; - if (split.is_cat && - !common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { - SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]); + if (split.is_cat) { + SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i], task); } float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum}); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index b03fd7b41b51..f28aac97b417 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -58,9 +58,9 @@ class GPUHistEvaluator { dh::device_vector feature_idx_; // Training param used for evaluation TrainParam param_; - // whether the input data requires sort based split, which is more complicated so we try - // to avoid it if possible. - bool has_sort_{false}; + // Do we have any categorical features that require sorting histograms? + // use this to skip the expensive sort step + bool need_sort_histogram_ = false; // Copy the categories from device to host asynchronously. void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index bc2027489131..6c081e1ba6df 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -30,46 +30,44 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, // This condition avoids sort-based split function calls if the users want // onehot-encoding-based splits. // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. - has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { - auto idx = i - 1; - if (common::IsCat(ft, idx)) { - auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); - return use_sort; - } - return false; - }); + need_sort_histogram_ = + thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { + auto idx = i - 1; + if (common::IsCat(ft, idx)) { + auto n_bins = ptrs[i] - ptrs[idx]; + bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + return use_sort; + } + return false; + }); - if (has_sort_) { - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
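The any_of in the evaluator hunk above only decides whether the sort-based categorical path can be skipped entirely. A plain host-side equivalent, purely as an illustration (NeedSortHistogram, the vector arguments and UseOneHotSketch are made-up names for this sketch):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    bool UseOneHotSketch(uint32_t n_cats, uint32_t max_cat_to_onehot) {
      return n_cats < max_cat_to_onehot;
    }

    // cut_ptrs[f] .. cut_ptrs[f+1] delimit the histogram bins of feature f.
    bool NeedSortHistogram(const std::vector<uint32_t>& cut_ptrs,
                           const std::vector<bool>& is_categorical,
                           uint32_t max_cat_to_onehot) {
      for (std::size_t f = 0; f + 1 < cut_ptrs.size(); ++f) {
        uint32_t n_bins = cut_ptrs[f + 1] - cut_ptrs[f];
        if (is_categorical[f] && !UseOneHotSketch(n_bins, max_cat_to_onehot)) {
          return true;  // at least one feature needs sort-based evaluation
        }
      }
      return false;
    }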
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. + sort_input_.resize(cat_sorted_idx_.size()); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), - feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); - } + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), + [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 569188fd5374..861b6e15b264 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -197,8 +197,6 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; - // Storing split categories for last node. 
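The dh::SegmentId lookup cached in the evaluator hunk above maps a global bin index back to the feature that owns it, using the cut_ptrs_ offsets. A standalone host-side version of the same idea (illustrative only, not the dh:: implementation):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // cut_ptrs[f] is the first bin of feature f; cut_ptrs.back() is the total
    // number of bins. Returns the feature owning bin `bin_idx`.
    std::size_t SegmentIdSketch(const std::vector<uint32_t>& cut_ptrs,
                                uint32_t bin_idx) {
      auto it = std::upper_bound(cut_ptrs.begin(), cut_ptrs.end(), bin_idx);
      return static_cast<std::size_t>(it - cut_ptrs.begin()) - 1;
    }
    // Example: with cut_ptrs = {0, 2, 4}, bins 0-1 belong to feature 0 and
    // bins 2-3 to feature 1, so SegmentIdSketch(cut_ptrs, 3) == 1.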
- dh::caching_device_vector node_categories; GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -354,14 +352,14 @@ struct GPUHistMakerDevice { return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); } - void UpdatePosition(int nidx, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[nidx]; - auto split_type = p_tree->NodeSplitType(nidx); + void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto node_cats = dh::ToSpan(node_categories); + auto node_cats = e.split.split_cats.Bits(); row_partitioner->UpdatePosition( - nidx, split_node.LeftChild(), split_node.RightChild(), + e.nid, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -567,28 +565,13 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value too large."; std::vector split_cats; - if (candidate.split.split_cats.Bits().empty()) { - if (common::InvalidCat(candidate.split.fvalue)) { - common::InvalidCategory(); - } - auto cat = common::AsCat(candidate.split.fvalue); - split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); - common::CatBitField cats_bits(split_cats); - cats_bits.Set(cat); - dh::CopyToD(split_cats, &node_categories); - } else { - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - - node_categories.resize(candidate.split.split_cats.Bits().size()); - dh::safe_cuda(cudaMemcpyAsync( - node_categories.data().get(), candidate.split.split_cats.Data(), - candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); - } - + CHECK_GT(candidate.split.split_cats.Bits().size(), 0); + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), @@ -674,7 +657,7 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
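For context, the row-routing callback passed to UpdatePosition reduces to a per-row decision of roughly the following shape (host-side sketch; the actual left/right convention, default direction and missing-value handling in the GPU kernel are simplified away here, and all names are illustrative):

    #include <cstdint>
    #include <vector>

    enum class SplitType { kNumerical, kCategorical };

    // Which child does a row go to, given the feature value tested by the
    // split? For categorical splits the chosen categories are a bitset over
    // category ids; here "bit set" means go left, but the kernel's actual
    // convention may differ.
    bool GoLeft(SplitType type, float fvalue, float split_value,
                const std::vector<uint32_t>& category_bits) {
      if (type == SplitType::kCategorical) {
        auto cat = static_cast<uint32_t>(fvalue);
        return (category_bits[cat / 32] >> (cat % 32)) & 1u;
      }
      return fvalue < split_value;  // numerical split
    }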
- this->UpdatePosition(candidate.nid, p_tree); + this->UpdatePosition(candidate, p_tree); monitor.Stop("UpdatePosition"); monitor.Start("BuildHist"); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 3c93c283917a..ea5556b38fca 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -174,8 +174,6 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); - - ASSERT_EQ(updater.node_categories.size(), 1); } } From dc100cfbf5bb10230875680424611a8136cb7996 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 2 May 2022 06:00:24 -0700 Subject: [PATCH 05/64] Refactor categoricals 2 --- src/common/categorical.h | 4 +- src/tree/gpu_hist/evaluate_splits.cu | 4 +- src/tree/gpu_hist/evaluate_splits.cuh | 28 ++++++++----- src/tree/gpu_hist/evaluator.cu | 15 +++---- src/tree/hist/evaluate_splits.h | 2 +- src/tree/updater_gpu_hist.cu | 4 +- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 31 ++++++++------ tests/cpp/tree/test_gpu_hist.cu | 40 ------------------- 8 files changed, 52 insertions(+), 76 deletions(-) diff --git a/src/common/categorical.h b/src/common/categorical.h index 5eff62264cf2..341a887f48a9 100644 --- a/src/common/categorical.h +++ b/src/common/categorical.h @@ -82,8 +82,8 @@ inline void InvalidCategory() { /*! * \brief Whether should we use onehot encoding for categorical data. */ -XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) { - bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot(); +XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) { + bool use_one_hot = n_cats < max_cat_to_onehot; return use_one_hot; } diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 7fba1902b881..2966b84e75af 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -241,7 +241,7 @@ __global__ void EvaluateSplitsKernel( if (common::IsCat(inputs.feature_types, fidx)) { auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx]; - if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) { + if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) { EvaluateFeature(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { @@ -281,7 +281,7 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &inp out_split.split_cats = common::CatBitField{out}; // Simple case for one hot split - if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) { + if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) { out_split.split_cats.Set(common::AsCat(out_split.fvalue)); return; } diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index f28aac97b417..67e56426217a 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -61,6 +61,9 @@ class GPUHistEvaluator { // Do we have any categorical features that require sorting histograms? // use this to skip the expensive sort step bool need_sort_histogram_ = false; + // Number of elements of categorical storage type + // needed to hold categoricals for a single mode + std::size_t node_categorical_storage_size_ = 0; // Copy the categories from device to host asynchronously. 
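The asynchronous device-to-host copy mentioned here overlaps with ongoing GPU work by pairing an event with a dedicated copy stream. A bare-CUDA sketch of that pattern (standalone and simplified; it is not the dh:: wrapper API used in the patch, error checking is omitted, and h_dst should be pinned for the copy to be truly asynchronous):

    #include <cstddef>
    #include <cuda_runtime.h>

    // Copy n_bytes of device data to host memory without blocking the default
    // stream: the copy stream first waits for an event recorded after the
    // kernel that produced d_src.
    void AsyncCopyToHost(void* h_dst, const void* d_src, size_t n_bytes,
                         cudaStream_t copy_stream) {
      cudaEvent_t ready;
      cudaEventCreateWithFlags(&ready, cudaEventDisableTiming);
      cudaEventRecord(ready, /*stream=*/0);        // default stream
      cudaStreamWaitEvent(copy_stream, ready, 0);  // order the copy after it
      cudaMemcpyAsync(h_dst, d_src, n_bytes, cudaMemcpyDeviceToHost, copy_stream);
      cudaEventDestroy(ready);  // release is deferred until the event completes
    }
    // The consumer synchronizes copy_stream before touching h_dst, which is
    // what the Sync() call in GetHostNodeCats() corresponds to.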
void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); @@ -69,12 +72,17 @@ class GPUHistEvaluator { * \brief Get host category storage of nidx for internal calculation. */ auto HostCatStorage(bst_node_t nidx) { - auto cat_bits = h_split_cats_.size() / param_.MaxNodes(); + + std::size_t min_size=(nidx+2)*node_categorical_storage_size_; + if(h_split_cats_.size(){h_split_cats_}.subspan(nidx * cat_bits, cat_bits); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * cat_bits, cat_bits * 2); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2); return cats_out; } @@ -82,12 +90,15 @@ class GPUHistEvaluator { * \brief Get device category storage of nidx for internal calculation. */ auto DeviceCatStorage(bst_node_t nidx) { - auto cat_bits = split_cats_.size() / param_.MaxNodes(); + std::size_t min_size=(nidx+2)*node_categorical_storage_size_; + if(split_cats_.size() ft, ObjInfo task, + void Reset(common::HistogramCuts const &cuts, common::Span ft, bst_feature_t n_features, TrainParam const ¶m, int32_t device); /** @@ -123,8 +134,7 @@ class GPUHistEvaluator { */ common::Span GetHostNodeCats(bst_node_t nidx) const { copy_stream_.View().Sync(); - auto cat_bits = h_split_cats_.size() / param_.MaxNodes(); - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * cat_bits, cat_bits); + auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } /** diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 6c081e1ba6df..777b017be24e 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -16,12 +16,12 @@ namespace xgboost { namespace tree { template void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, - common::Span ft, ObjInfo task, + common::Span ft, bst_feature_t n_features, TrainParam const ¶m, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical() && !task.UseOneHot()) { + if (cuts.HasCategorical()) { dh::XGBCachingDeviceAllocator alloc; auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); auto beg = thrust::make_counting_iterator(1ul); @@ -35,21 +35,22 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, auto idx = i - 1; if (common::IsCat(ft, idx)) { auto n_bins = ptrs[i] - ptrs[idx]; - bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + bool use_sort = !common::UseOneHot(n_bins, to_onehot); return use_sort; } return false; }); - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); + node_categorical_storage_size_ = + common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(node_categorical_storage_size_, 0); // We need to allocate for all nodes since the updater can grow the tree layer by // layer, all nodes in the same layer must be preserved until that layer is // finished. We can allocate one layer at a time, but the best case is reducing the // size of the bitset by about a half, at the cost of invoking CUDA malloc many more // times than necessary. 
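The per-node category storage in the hunk above grows lazily instead of being sized for the whole tree up front: asking for node nidx guarantees space for nidx and the next node id (its sibling, since children are allocated consecutively) and returns a view into a flat buffer. A host-side sketch of the same arithmetic, with made-up names and a plain std::vector standing in for the host/device containers:

    #include <cstddef>
    #include <cstdint>
    #include <utility>
    #include <vector>

    // Flat, lazily grown storage: one fixed-size bitset slot per node id.
    struct NodeCatStorage {
      std::size_t words_per_node;   // bitset words needed for one node
      std::vector<uint32_t> buffer;

      // Ensure node `nidx` and the following node id both have a slot, then
      // return (offset, length) of the region covering the two slots.
      std::pair<std::size_t, std::size_t> SiblingSlots(int nidx) {
        std::size_t min_size = (static_cast<std::size_t>(nidx) + 2) * words_per_node;
        if (buffer.size() < min_size) buffer.resize(min_size, 0);
        return {static_cast<std::size_t>(nidx) * words_per_node, 2 * words_per_node};
      }
    };

In the patch the same arithmetic backs both the host and the device buffer, and the root node only consumes a single slot.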
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); + split_cats_.resize(node_categorical_storage_size_); + h_split_cats_.resize(node_categorical_storage_size_); dh::safe_cuda( cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 4e445a0680e5..8a61ea809c04 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -244,7 +244,7 @@ template class HistEvaluator { } if (is_cat) { auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx]; - if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) { + if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) { EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best); EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best); } else { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 861b6e15b264..8ee6f43f78f5 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -241,7 +241,7 @@ struct GPUHistMakerDevice { param.colsample_bytree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); - this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param, + this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id); this->interaction_constraints.Reset(); @@ -571,7 +571,7 @@ struct GPUHistMakerDevice { split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); - + tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 0cbfc9f2a6cf..2243cb4dda90 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -24,14 +24,16 @@ void TestEvaluateSingleSplit(bool is_categorical) { TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{1.0, 2.0, 11.0, 12.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 2, 4}; + cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); thrust::device_vector feature_set = std::vector{0, 1}; - thrust::device_vector feature_segments = - std::vector{0, 2, 4}; - thrust::device_vector feature_values = - std::vector{1.0, 2.0, 11.0, 12.0}; - thrust::device_vector feature_min_values = - std::vector{0.0, 0.0}; + // Setup gradients so that second feature gets higher gain thrust::device_vector feature_histogram = std::vector{ @@ -42,21 +44,25 @@ void TestEvaluateSingleSplit(bool is_categorical) { FeatureType::kCategorical); common::Span d_feature_types; if (is_categorical) { + auto max_cat = *std::max_element(cuts.cut_values_.HostVector().begin(), + cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); d_feature_types = dh::ToSpan(feature_types); } + EvaluateSplitInputs input{1, parent_sum, param, dh::ToSpan(feature_set), d_feature_types, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), + 
cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), dh::ToSpan(feature_histogram)}; GPUHistEvaluator evaluator{ - tparam, static_cast(feature_min_values.size()), 0}; - dh::device_vector out_cats; + tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0, ObjInfo{ObjInfo::kRegression}).split; @@ -264,8 +270,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cuts_.cut_values_.SetDevice(0); cuts_.min_vals_.SetDevice(0); - ObjInfo task{ObjInfo::kRegression}; - evaluator.Reset(cuts_, dh::ToSpan(ft), task, info_.num_col_, param_, 0); + evaluator.Reset(cuts_, dh::ToSpan(ft), info_.num_col_, param_, 0); dh::device_vector d_hist(hist_[0].size()); auto node_hist = hist_[0]; diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index ea5556b38fca..2f3cc9c7d950 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -137,46 +137,6 @@ TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); } -TEST(GpuHist, ApplySplit) { - RegTree tree; - GPUExpandEntry candidate; - candidate.nid = 0; - candidate.left_weight = 1.0f; - candidate.right_weight = 2.0f; - candidate.base_weight = 3.0f; - candidate.split.is_cat = true; - candidate.split.fvalue = 1.0f; // at cat 1 - - size_t n_rows = 10; - size_t n_cols = 10; - - auto m = RandomDataGenerator{n_rows, n_cols, 0}.GenerateDMatrix(true); - GenericParameter p; - p.InitAllowUnknown(Args{}); - - TrainParam tparam; - tparam.InitAllowUnknown(Args{}); - BatchParam bparam; - bparam.gpu_id = 0; - bparam.max_bin = 3; - Context ctx{CreateEmptyGenericParam(0)}; - - for (auto& ellpack : m->GetBatches(bparam)){ - auto impl = ellpack.Impl(); - HostDeviceVector feature_types(10, FeatureType::kCategorical); - feature_types.SetDevice(bparam.gpu_id); - tree::GPUHistMakerDevice updater( - &ctx, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, bparam); - updater.ApplySplit(candidate, &tree); - - ASSERT_EQ(tree.GetSplitTypes().size(), 3); - ASSERT_EQ(tree.GetSplitTypes()[0], FeatureType::kCategorical); - ASSERT_EQ(tree.GetSplitCategories().size(), 1); - uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 - ASSERT_EQ(tree.GetSplitCategories().back(), bits); - } -} - HistogramCutsWrapper GetHostCutMatrix () { HistogramCutsWrapper cmat; cmat.SetPtrs({0, 3, 6, 9, 12, 15, 18, 21, 24}); From bc744585f7832381fd525090718286b7f93b6d09 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 2 May 2022 06:21:51 -0700 Subject: [PATCH 06/64] Skip copy if no categoricals --- src/tree/gpu_hist/evaluate_splits.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 2966b84e75af..5326b103d2d7 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -361,6 +361,7 @@ void GPUHistEvaluator::EvaluateSplits( template void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out) { + if (cats_out.empty()) return; dh::CUDAEvent event; event.Record(dh::DefaultStream()); auto h_cats = this->HostCatStorage(input.nidx); From c4f8eac8996262d8447e73ca24b88569e34fc5c2 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 04:35:32 -0700 Subject: [PATCH 07/64] Review comment --- .gitignore | 5 ++++- src/tree/gpu_hist/evaluator.cu | 5 ----- 2 files changed, 4 insertions(+), 6 deletions(-) diff 
--git a/.gitignore b/.gitignore index e847342b19bd..20b92c057e1a 100644 --- a/.gitignore +++ b/.gitignore @@ -130,4 +130,7 @@ credentials.csv # Visual Studio code + extensions .vscode .metals -.bloop \ No newline at end of file +.bloop + +# hypothesis python tests +.hypothesis \ No newline at end of file diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 777b017be24e..aaf35243b2f5 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -44,11 +44,6 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, node_categorical_storage_size_ = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); CHECK_NE(node_categorical_storage_size_, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. split_cats_.resize(node_categorical_storage_size_); h_split_cats_.resize(node_categorical_storage_size_); dh::safe_cuda( From a1cddaabbf93bb0be86bfc293dea5a84e233d719 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 07:30:55 -0700 Subject: [PATCH 08/64] Revert "Categoricals broken" This reverts commit 80a3e78f9e1dcbf2a78f6572897453c61afd60b0. --- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/evaluator.cu | 73 ++++--- src/tree/gpu_hist/histogram.cu | 9 + src/tree/updater_gpu_hist.cu | 223 +++++++++------------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 + tests/cpp/tree/test_gpu_hist.cu | 45 ++--- 6 files changed, 167 insertions(+), 186 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 7d792051e5be..b03fd7b41b51 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -92,7 +92,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs. + * \brief Get sorted index storage based on the left node of inputs . */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 381ef8fbb349..bc2027489131 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -21,36 +21,55 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; - if (cuts.HasCategorical()) { + if (cuts.HasCategorical() && !task.UseOneHot()) { dh::XGBCachingDeviceAllocator alloc; - auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); - CHECK_NE(bit_storage_size, 0); - // We need to allocate for all nodes since the updater can grow the tree layer by - // layer, all nodes in the same layer must be preserved until that layer is - // finished. We can allocate one layer at a time, but the best case is reducing the - // size of the bitset by about a half, at the cost of invoking CUDA malloc many more - // times than necessary. 
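For reference, the bit-storage sizing used with CatBitField::ComputeStorageSize works out roughly as follows (a simplified stand-in assuming 32-bit storage words; the real helper may round differently):

    #include <cstddef>

    // Words needed to hold one bit per category value in [0, max_category].
    std::size_t ComputeStorageWordsSketch(std::size_t max_category) {
      std::size_t n_bits = max_category + 1;
      return (n_bits + 31) / 32;
    }
    // The split-category buffer is then some multiple of this word count:
    // one slot per node whose bitset has to stay alive during the build.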
- split_cats_.resize(param.MaxNodes() * bit_storage_size); - h_split_cats_.resize(split_cats_.size()); - dh::safe_cuda( - cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + auto beg = thrust::make_counting_iterator(1ul); + auto end = thrust::make_counting_iterator(ptrs.size()); + auto to_onehot = param.max_cat_to_onehot; + // This condition avoids sort-based split function calls if the users want + // onehot-encoding-based splits. + // For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x. + has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) { + auto idx = i - 1; + if (common::IsCat(ft, idx)) { + auto n_bins = ptrs[i] - ptrs[idx]; + bool use_sort = !common::UseOneHot(n_bins, to_onehot, task); + return use_sort; + } + return false; + }); - cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. - sort_input_.resize(cat_sorted_idx_.size()); + if (has_sort_) { + auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1); + CHECK_NE(bit_storage_size, 0); + // We need to allocate for all nodes since the updater can grow the tree layer by + // layer, all nodes in the same layer must be preserved until that layer is + // finished. We can allocate one layer at a time, but the best case is reducing the + // size of the bitset by about a half, at the cost of invoking CUDA malloc many more + // times than necessary. + split_cats_.resize(param.MaxNodes() * bit_storage_size); + h_split_cats_.resize(split_cats_.size()); + dh::safe_cuda( + cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST))); - /** - * cache feature index binary search result - */ - feature_idx_.resize(cat_sorted_idx_.size()); - auto d_fidxes = dh::ToSpan(feature_idx_); - auto it = thrust::make_counting_iterator(0ul); - auto values = cuts.cut_values_.ConstDeviceSpan(); - auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); - thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(), - [=] XGBOOST_DEVICE(size_t i) { - auto fidx = dh::SegmentId(ptrs, i); - return fidx; - }); + cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time. 
+ sort_input_.resize(cat_sorted_idx_.size()); + + /** + * cache feature index binary search result + */ + feature_idx_.resize(cat_sorted_idx_.size()); + auto d_fidxes = dh::ToSpan(feature_idx_); + auto it = thrust::make_counting_iterator(0ul); + auto values = cuts.cut_values_.ConstDeviceSpan(); + auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); + thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), + feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) { + auto fidx = dh::SegmentId(ptrs, i); + return fidx; + }); + } } } diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index efb08d5e44e2..791363a05cdd 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,6 +247,15 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } +template void BuildGradientHistogram( + EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, + HistRounding rounding, + bool force_global_memory); + template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 2cd9d4babeb1..2340687983a8 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -57,7 +57,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogramStorage + * \struct DeviceHistogram * * \summary Data storage for node histograms on device. Automatically expands. * @@ -67,18 +67,12 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogramStorage { +template +class DeviceHistogram { private: /*! \brief Map nidx to starting index of its histogram. */ std::map nidx_map_; - // Large buffer of zeroed memory, caches histograms dh::device_vector data_; - // If we run out of storage allocate one histogram at a time - // in overflow. 
Not cached, overwritten when a new histogram - // is requested - dh::device_vector overflow_; - std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -87,8 +81,6 @@ class DeviceHistogramStorage { "Number of items in gradient type should be 2."); public: - // Start with about 16mb - DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -99,53 +91,21 @@ class DeviceHistogramStorage { dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); - overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } - dh::device_vector& Data() { return data_; } - - void AllocateHistograms(const std::vector& new_nidxs) { - for (int nidx : new_nidxs) { - CHECK(!HistogramExists(nidx)); - } - // Number of items currently used in data - const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); - if (used_size >= kStopGrowingSize) { - // Use overflow - // Delete previous entries - overflow_nidx_map_.clear(); - overflow_.resize(HistogramSize() * new_nidxs.size()); - // Zero memory - auto d_data = overflow_.data().get(); - dh::LaunchN(overflow_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0; }); - // Append new histograms - for (int nidx : new_nidxs) { - overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); - } - } else { - CHECK_GE(data_.size(), used_size); - // Expand if necessary - if (data_.size() < new_used_size) { - data_.resize(std::max(data_.size() * 2, new_used_size)); - } - // Append new histograms - for (int nidx : new_nidxs) { - nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); - } - } + size_t HistogramSize() const { + return n_bins_ * kNumItemsInGradientSum; + } - CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); + dh::device_vector& Data() { + return data_; } - /* void AllocateHistogram(int nidx) { if (HistogramExists(nidx)) return; // Number of items currently used in data @@ -179,7 +139,6 @@ class DeviceHistogramStorage { CHECK_GE(data_.size(), nidx_map_.size() * HistogramSize()); } - */ /** * \summary Return pointer to histogram memory for a given node. 
@@ -188,16 +147,9 @@ class DeviceHistogramStorage { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - - if (nidx_map_.find(nidx) != nidx_map_.cend()) { - // Fetch from normal cache - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span(reinterpret_cast(ptr), n_bins_); - } else { - // Fetch from overflow - auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); - return common::Span(reinterpret_cast(ptr), n_bins_); - } + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span( + reinterpret_cast(ptr), n_bins_); } }; @@ -214,7 +166,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogramStorage hist{}; + DeviceHistogram hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -237,6 +189,8 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; std::unique_ptr feature_groups; + // Storing split categories for last node. + dh::caching_device_vector node_categories; GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, @@ -365,6 +319,7 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { + hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(device_id), @@ -372,12 +327,8 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - // Attempt to the subtraction trick - // return true if succeeded - bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { - return false; - } + void SubtractionTrick(int nidx_parent, int nidx_histogram, + int nidx_subtraction) { auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -386,18 +337,22 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); - return true; } - void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[e.nid]; - auto split_type = p_tree->NodeSplitType(e.nid); + bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + // Make sure histograms are already allocated + hist.AllocateHistogram(nidx_subtraction); + return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); + } + + void UpdatePosition(int nidx, RegTree* p_tree) { + RegTree::Node split_node = (*p_tree)[nidx]; + auto split_type = p_tree->NodeSplitType(nidx); auto d_matrix = page->GetDeviceAccessor(device_id); - auto node_cats = e.split.split_cats.Bits(); - + auto node_cats = dh::ToSpan(node_categories); row_partitioner->UpdatePosition( - e.nid, split_node.LeftChild(), split_node.RightChild(), + nidx, split_node.LeftChild(), split_node.RightChild(), [=] __device__(bst_uint ridx) { // given a row index, returns the node id it belongs to bst_float cut_value = @@ -528,15 +483,13 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - // num histograms is the number of contiguous histograms in memory to reduce over - void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { + void AllReduceHist(int nidx, dh::AllReducer* reducer) { monitor.Start("AllReduce"); auto d_node_hist = 
hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum(reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * - (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * - num_histograms); + reducer->AllReduceSum( + reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); monitor.Stop("AllReduce"); } @@ -544,49 +497,33 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(std::vectorconst &candidates, dh::AllReducer* reducer, const RegTree& tree) { - if(candidates.empty()) return; - // Some nodes we will manually compute histograms - // others we will do by subtraction - std::vector hist_nidx; - std::vector subtraction_nidx; - for (auto& e : candidates) { - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); - if (fewer_right) { - hist_nidx.emplace_back(tree[e.nid].RightChild()); - subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); - } else { - hist_nidx.emplace_back(tree[e.nid].LeftChild()); - subtraction_nidx.emplace_back(tree[e.nid].RightChild()); - } - } - std::vector all_new = hist_nidx; - all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); - // Allocate the histograms - // Guaranteed contiguous memory - hist.AllocateHistograms(all_new); - - for(auto nidx:hist_nidx){ - this->BuildHist(nidx); + void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, + int nidx_right, dh::AllReducer* reducer) { + auto build_hist_nidx = nidx_left; + auto subtraction_trick_nidx = nidx_right; + + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); + if (fewer_right) { + std::swap(build_hist_nidx, subtraction_trick_nidx); } - // Reduce all in one go - // This gives much better latency in a distributed setting - // when processing a large batch - this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); + this->BuildHist(build_hist_nidx); + this->AllReduceHist(build_hist_nidx, reducer); - for (int i = 0; i < subtraction_nidx.size(); i++) { - auto build_hist_nidx = hist_nidx.at(i); - auto subtraction_trick_nidx = subtraction_nidx.at(i); - auto parent_nidx = candidates.at(i).nid; + // Check whether we can use the subtraction trick to calculate the other + bool do_subtraction_trick = this->CanDoSubtractionTrick( + candidate.nid, build_hist_nidx, subtraction_trick_nidx); - if(!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)){ - // Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer, 1); - } + if (do_subtraction_trick) { + // Calculate other histogram using subtraction trick + this->SubtractionTrick(candidate.nid, build_hist_nidx, + subtraction_trick_nidx); + } else { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer); } } @@ -609,11 +546,27 @@ struct GPUHistMakerDevice { CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) << "Categorical feature value 
too large."; std::vector split_cats; - auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); - CHECK_LE(split_cats.size(), h_cats.size()); - std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + if (candidate.split.split_cats.Bits().empty()) { + if (common::InvalidCat(candidate.split.fvalue)) { + common::InvalidCategory(); + } + auto cat = common::AsCat(candidate.split.fvalue); + split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0); + common::CatBitField cats_bits(split_cats); + cats_bits.Set(cat); + dh::CopyToD(split_cats, &node_categories); + } else { + auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); + auto max_cat = candidate.split.MaxCat(); + split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + CHECK_LE(split_cats.size(), h_cats.size()); + std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); + + node_categories.resize(candidate.split.split_cats.Bits().size()); + dh::safe_cuda(cudaMemcpyAsync( + node_categories.data().get(), candidate.split.split_cats.Data(), + candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice)); + } tree.ExpandCategorical( candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir, @@ -645,9 +598,8 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); - hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer, 1); + this->AllReduceHist(kRootNIdx, reducer); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -686,7 +638,6 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); - auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -695,16 +646,22 @@ struct GPUHistMakerDevice { // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call // in GPU Hist. 
- this->UpdatePosition(e, p_tree); + this->UpdatePosition(e.nid, p_tree); monitor.Stop("UpdatePosition"); } - monitor.Start("BuildHist"); - this->BuildHistLeftRight(filtered_expand_set, reducer, tree); - monitor.Stop("BuildHist"); + for (auto i = 0ull; i < filtered_expand_set.size(); i++) { + auto candidate = expand_set.at(i); + int left_child_nidx = tree[candidate.nid].LeftChild(); + int right_child_nidx = tree[candidate.nid].RightChild(); + + monitor.Start("BuildHist"); + this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); + monitor.Stop("BuildHist"); + } for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = filtered_expand_set.at(i); + auto candidate = expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 75d97b681a61..3b543a48d7cc 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,6 +95,7 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { + TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index bdabbbcb38c2..883537863307 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,38 +29,29 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogramStorage histogram; + DeviceHistogram histogram; histogram.Init(0, kNBins); - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); + for (size_t i = 0; i < kNNodes; ++i) { + histogram.AllocateHistogram(i); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (int i = 0; i < kNNodes; ++i) { - histogram.AllocateHistograms({i}); + for (size_t i = 0; i < kNNodes; ++i) { + histogram.AllocateHistogram(i); } - for (int i = 0; i < kNNodes; ++i) { + for (size_t i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Add two new nodes - histogram.AllocateHistograms({kNNodes}); - histogram.AllocateHistograms({kNNodes+1}); - - // Old cached nodes should still exist - for (int i = 0; i < kNNodes; ++i) { - ASSERT_TRUE(histogram.HistogramExists(i)); + // Erase existing nidx_map. 
+ for (size_t i = kNNodes; i < kNNodes * 2; ++i) { + histogram.AllocateHistogram(i); + } + for (size_t i = 0; i < kNNodes; ++i) { + ASSERT_FALSE(histogram.HistogramExists(i)); } - - // Should be deleted - ASSERT_FALSE(histogram.HistogramExists({kNNodes})); - // Most recent node should exist - ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); - - // Add same node again - should fail - EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -104,9 +95,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistograms({0}); + maker.hist.AllocateHistogram(0); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair); + maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -114,7 +105,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogramStorage& d_hist = maker.hist; + DeviceHistogram& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -137,10 +128,12 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); + TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); + TestBuildHist(true); } TEST(GpuHist, ApplySplit) { @@ -180,6 +173,8 @@ TEST(GpuHist, ApplySplit) { ASSERT_EQ(tree.GetSplitCategories().size(), 1); uint32_t bits = 1u << 30; // bits: 0, 1, 0, 0, 0, ..., 0 ASSERT_EQ(tree.GetSplitCategories().back(), bits); + + ASSERT_EQ(updater.node_categories.size(), 1); } } @@ -243,7 +238,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistograms({0}); + maker.hist.AllocateHistogram(0); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); From fd0e25e0bd2cf05f33766c7b1deb1471126f9447 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 5 May 2022 08:49:40 -0700 Subject: [PATCH 09/64] Lint --- src/tree/driver.h | 4 ++-- src/tree/updater_approx.cc | 8 +------- src/tree/updater_gpu_hist.cu | 4 ++-- src/tree/updater_quantile_hist.cc | 7 +------ 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index 1e40cc32622f..e61255e043c7 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -57,7 +57,7 @@ class Driver { // Can a child of this entry still be expanded? 
// can be used to avoid extra work - bool IsChildValid(ExpandEntryT const& parent_entry){ + bool IsChildValid(ExpandEntryT const& parent_entry) { if (param_.max_depth > 0 && parent_entry.depth + 1 >= param_.max_depth) return false; if (param_.max_leaves > 0 && num_leaves_ >= param_.max_leaves) return false; return true; @@ -100,7 +100,7 @@ class Driver { private: TrainParam param_; - std::size_t num_leaves_=1; + std::size_t num_leaves_ = 1; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc index fc05aed0a3ee..99e7cf738200 100644 --- a/src/tree/updater_approx.cc +++ b/src/tree/updater_approx.cc @@ -184,7 +184,6 @@ class GloablApproxBuilder { Driver driver(param_); auto &tree = *p_tree; driver.Push({this->InitRoot(p_fmat, gpair, hess, p_tree)}); - bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); /** @@ -203,14 +202,9 @@ class GloablApproxBuilder { // candidates that can be applied. std::vector applied; for (auto const &candidate : expand_set) { - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } evaluator_.ApplyTreeSplit(candidate, p_tree); applied.push_back(candidate); - num_leaves++; - int left_child_nidx = tree[candidate.nid].LeftChild(); - if (CPUExpandEntry::ChildIsValid(param_, p_tree->GetDepth(left_child_nidx), num_leaves)) { + if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 07f1499e213f..634f2969a090 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -637,7 +637,7 @@ struct GPUHistMakerDevice { // The set of leaves that can be expanded asynchronously auto expand_set = driver.Pop(); while (!expand_set.empty()) { - for(auto & candidate: expand_set){ + for (auto& candidate : expand_set) { this->ApplySplit(candidate, p_tree); } // Get the candidates we are allowed to expand further @@ -649,7 +649,7 @@ struct GPUHistMakerDevice { auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); - for(const auto &e:filtered_expand_set){ + for (const auto& e : filtered_expand_set) { monitor.Start("UpdatePosition"); // Update position is only run when child is valid, instead of right after apply // split (as in approx tree method). Hense we have the finalise position call diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index c69f8c8dba0b..ed3dff67295a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -196,7 +196,6 @@ void QuantileHistMaker::Builder::ExpandTree( Driver driver(param_); driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); auto const &tree = *p_tree; - bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); while (!expand_set.empty()) { @@ -206,13 +205,9 @@ void QuantileHistMaker::Builder::ExpandTree( std::vector applied; int32_t depth = expand_set.front().depth + 1; for (auto const& candidate : expand_set) { - if (!candidate.IsValid(param_, num_leaves)) { - continue; - } evaluator_->ApplyTreeSplit(candidate, p_tree); applied.push_back(candidate); - num_leaves++; - if (CPUExpandEntry::ChildIsValid(param_, depth, num_leaves)) { + if (driver.IsChildValid(candidate)) { valid_candidates.emplace_back(candidate); } } From 56785f3168c26a248572e6edd0f6c8b8c2885bde Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 6 May 2022 05:05:15 -0700 Subject: [PATCH 10/64] Revert "Revert "Categoricals broken"" This reverts commit a1cddaabbf93bb0be86bfc293dea5a84e233d719. 
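The Driver::IsChildValid helper introduced by the lint patch above centralizes the depth and leaf-count checks that the updaters previously tracked by hand. A minimal host-side sketch of the bookkeeping (ExpandEntrySketch and DriverSketch are illustrative stand-ins for CPUExpandEntry/GPUExpandEntry and Driver):

    #include <cstddef>
    #include <vector>

    struct ExpandEntrySketch {  // stand-in for CPUExpandEntry / GPUExpandEntry
      int nid;
      int depth;
    };

    // A child may only be expanded while the depth and leaf limits still allow
    // growth; 0 means "no limit", as in TrainParam.
    struct DriverSketch {
      int max_depth;
      int max_leaves;
      std::size_t num_leaves = 1;

      bool IsChildValid(const ExpandEntrySketch& parent) const {
        if (max_depth > 0 && parent.depth + 1 >= max_depth) return false;
        if (max_leaves > 0 && num_leaves >= static_cast<std::size_t>(max_leaves)) return false;
        return true;
      }
    };
    // Per-level loop shape in the updaters: apply every popped split, then keep
    // a candidate for histogram building only if IsChildValid(candidate).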
--- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/histogram.cu | 9 - src/tree/updater_gpu_hist.cu | 202 ++++++++++++---------- tests/cpp/tree/gpu_hist/test_histogram.cu | 1 - tests/cpp/tree/test_gpu_hist.cu | 43 +++-- tests/python-gpu/test_gpu_updaters.py | 2 +- 6 files changed, 140 insertions(+), 119 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 8d5cc809a280..08b0270ee4d7 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -103,7 +103,7 @@ class GPUHistEvaluator { } /** - * \brief Get sorted index storage based on the left node of inputs . + * \brief Get sorted index storage based on the left node of inputs. */ auto SortedIdx(EvaluateSplitInputs left) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 791363a05cdd..efb08d5e44e2 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -247,15 +247,6 @@ void BuildGradientHistogram(EllpackDeviceAccessor const& matrix, dh::safe_cuda(cudaGetLastError()); } -template void BuildGradientHistogram( - EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, - HistRounding rounding, - bool force_global_memory); - template void BuildGradientHistogram( EllpackDeviceAccessor const& matrix, FeatureGroupsAccessor const& feature_groups, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 634f2969a090..964a486baf16 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -62,7 +62,7 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); #endif // !defined(GTEST_TEST) /** - * \struct DeviceHistogram + * \struct DeviceHistogramStorage * * \summary Data storage for node histograms on device. Automatically expands. * @@ -72,12 +72,18 @@ DMLC_REGISTER_PARAMETER(GPUHistMakerTrainParam); * \author Rory * \date 28/07/2018 */ -template -class DeviceHistogram { +template +class DeviceHistogramStorage { private: /*! \brief Map nidx to starting index of its histogram. */ std::map nidx_map_; + // Large buffer of zeroed memory, caches histograms dh::device_vector data_; + // If we run out of storage allocate one histogram at a time + // in overflow. 
Not cached, overwritten when a new histogram + // is requested + dh::device_vector overflow_; + std::map overflow_nidx_map_; int n_bins_; int device_id_; static constexpr size_t kNumItemsInGradientSum = @@ -86,6 +92,8 @@ class DeviceHistogram { "Number of items in gradient type should be 2."); public: + // Start with about 16mb + DeviceHistogramStorage() { data_.reserve(1 << 22); } void Init(int device_id, int n_bins) { this->n_bins_ = n_bins; this->device_id_ = device_id; @@ -93,52 +101,48 @@ class DeviceHistogram { void Reset() { auto d_data = data_.data().get(); - dh::LaunchN(data_.size(), - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); + dh::LaunchN(data_.size(), [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); nidx_map_.clear(); + overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend(); + return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } int Bins() const { return n_bins_; } - size_t HistogramSize() const { - return n_bins_ * kNumItemsInGradientSum; - } - - dh::device_vector& Data() { - return data_; - } + size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } + dh::device_vector& Data() { return data_; } - void AllocateHistogram(int nidx) { - if (HistogramExists(nidx)) return; + void AllocateHistograms(const std::vector& new_nidxs) { + for (int nidx : new_nidxs) { + CHECK(!HistogramExists(nidx)); + } // Number of items currently used in data const size_t used_size = nidx_map_.size() * HistogramSize(); - const size_t new_used_size = used_size + HistogramSize(); - if (data_.size() >= kStopGrowingSize) { - // Recycle histogram memory - if (new_used_size <= data_.size()) { - // no need to remove old node, just insert the new one. 
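The storage class being reintroduced here keeps one cached histogram per node until the cache hits a size cap, then parks further batches in a scratch overflow area that the next allocation overwrites, which is the behaviour the updated DeviceHistogram test earlier in this series checks. A compact host-side sketch of that policy, with plain std::vector/std::map standing in for the device containers and reset/zeroing details omitted:

    #include <algorithm>
    #include <cstddef>
    #include <map>
    #include <vector>

    // Two-tier histogram storage: nodes are cached in `data` until it reaches
    // `stop_growing`; after that, each new batch of nodes lives in a single
    // `overflow` buffer that the next batch overwrites.
    struct HistStorageSketch {
      std::size_t hist_size;     // floats per node histogram
      std::size_t stop_growing;  // cache limit, in floats
      std::vector<float> data, overflow;
      std::map<int, std::size_t> cached, overflowed;

      void Allocate(const std::vector<int>& nidxs) {
        std::size_t used = cached.size() * hist_size;
        if (used >= stop_growing) {
          overflowed.clear();  // drop the previous overflow batch
          overflow.assign(nidxs.size() * hist_size, 0.0f);
          for (int nidx : nidxs) {
            std::size_t offset = overflowed.size() * hist_size;
            overflowed[nidx] = offset;
          }
        } else {
          std::size_t needed = used + nidxs.size() * hist_size;
          if (data.size() < needed) data.resize(std::max(data.size() * 2, needed), 0.0f);
          for (int nidx : nidxs) {
            std::size_t offset = cached.size() * hist_size;
            cached[nidx] = offset;
          }
        }
      }

      float* Histogram(int nidx) {
        auto it = cached.find(nidx);
        return it != cached.end() ? data.data() + it->second
                                  : overflow.data() + overflowed.at(nidx);
      }
    };

Cached nodes therefore stay addressable for the whole build, while overflow nodes are only valid until the next allocation.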
- nidx_map_[nidx] = used_size; - // memset histogram size in bytes - } else { - std::pair old_entry = *nidx_map_.begin(); - nidx_map_.erase(old_entry.first); - nidx_map_[nidx] = old_entry.second; + const size_t new_used_size = used_size + HistogramSize() * new_nidxs.size(); + if (used_size >= kStopGrowingSize) { + // Use overflow + // Delete previous entries + overflow_nidx_map_.clear(); + overflow_.resize(HistogramSize() * new_nidxs.size()); + // Zero memory + auto d_data = overflow_.data().get(); + dh::LaunchN(overflow_.size(), + [=] __device__(size_t idx) { d_data[idx] = 0.0; }); + // Append new histograms + for (int nidx : new_nidxs) { + overflow_nidx_map_[nidx] = overflow_nidx_map_.size() * HistogramSize(); } - // Zero recycled memory - auto d_data = data_.data().get() + nidx_map_[nidx]; - dh::LaunchN(n_bins_ * 2, - [=] __device__(size_t idx) { d_data[idx] = 0.0f; }); } else { - // Append new node histogram - nidx_map_[nidx] = used_size; - // Check there is enough memory for another histogram node - if (data_.size() < new_used_size + HistogramSize()) { - size_t new_required_memory = - std::max(data_.size() * 2, HistogramSize()); - data_.resize(new_required_memory); + CHECK_GE(data_.size(), used_size); + // Expand if necessary + if (data_.size() < new_used_size) { + data_.resize(std::max(data_.size() * 2, new_used_size)); + } + // Append new histograms + for (int nidx : new_nidxs) { + nidx_map_[nidx] = nidx_map_.size() * HistogramSize(); } } @@ -152,9 +156,16 @@ class DeviceHistogram { */ common::Span GetNodeHistogram(int nidx) { CHECK(this->HistogramExists(nidx)); - auto ptr = data_.data().get() + nidx_map_.at(nidx); - return common::Span( - reinterpret_cast(ptr), n_bins_); + + if (nidx_map_.find(nidx) != nidx_map_.cend()) { + // Fetch from normal cache + auto ptr = data_.data().get() + nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } else { + // Fetch from overflow + auto ptr = overflow_.data().get() + overflow_nidx_map_.at(nidx); + return common::Span(reinterpret_cast(ptr), n_bins_); + } } }; @@ -171,7 +182,7 @@ struct GPUHistMakerDevice { BatchParam batch_param; std::unique_ptr row_partitioner; - DeviceHistogram hist{}; + DeviceHistogramStorage hist{}; dh::caching_device_vector d_gpair; // storage for gpair; common::Span gpair; @@ -322,7 +333,6 @@ struct GPUHistMakerDevice { } void BuildHist(int nidx) { - hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id), @@ -330,8 +340,12 @@ struct GPUHistMakerDevice { d_ridx, d_node_hist, histogram_rounding); } - void SubtractionTrick(int nidx_parent, int nidx_histogram, - int nidx_subtraction) { + // Attempt to do subtraction trick + // return true if succeeded + bool SubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { + if (!hist.HistogramExists(nidx_histogram) || !hist.HistogramExists(nidx_parent)) { + return false; + } auto d_node_hist_parent = hist.GetNodeHistogram(nidx_parent); auto d_node_hist_histogram = hist.GetNodeHistogram(nidx_histogram); auto d_node_hist_subtraction = hist.GetNodeHistogram(nidx_subtraction); @@ -340,12 +354,7 @@ struct GPUHistMakerDevice { d_node_hist_subtraction[idx] = d_node_hist_parent[idx] - d_node_hist_histogram[idx]; }); - } - - bool CanDoSubtractionTrick(int nidx_parent, int nidx_histogram, int nidx_subtraction) { - // Make sure histograms are already allocated - hist.AllocateHistogram(nidx_subtraction); - return 
hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent); + return true; } void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { @@ -505,13 +514,15 @@ struct GPUHistMakerDevice { row_partitioner.reset(); } - void AllReduceHist(int nidx, dh::AllReducer* reducer) { + // num histograms is the number of contiguous histograms in memory to reduce over + void AllReduceHist(int nidx, dh::AllReducer* reducer, int num_histograms) { monitor.Start("AllReduce"); auto d_node_hist = hist.GetNodeHistogram(nidx).data(); - reducer->AllReduceSum( - reinterpret_cast(d_node_hist), - reinterpret_cast(d_node_hist), - page->Cuts().TotalBins() * (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT))); + reducer->AllReduceSum(reinterpret_cast(d_node_hist), + reinterpret_cast(d_node_hist), + page->Cuts().TotalBins() * + (sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT)) * + num_histograms); monitor.Stop("AllReduce"); } @@ -519,33 +530,50 @@ struct GPUHistMakerDevice { /** * \brief Build GPU local histograms for the left and right child of some parent node */ - void BuildHistLeftRight(const GPUExpandEntry &candidate, int nidx_left, - int nidx_right, dh::AllReducer* reducer) { - auto build_hist_nidx = nidx_left; - auto subtraction_trick_nidx = nidx_right; - - // Decide whether to build the left histogram or right histogram - // Use sum of Hessian as a heuristic to select node with fewest training instances - bool fewer_right = candidate.split.right_sum.GetHess() < candidate.split.left_sum.GetHess(); - if (fewer_right) { - std::swap(build_hist_nidx, subtraction_trick_nidx); + void BuildHistLeftRight(std::vector const& candidates, dh::AllReducer* reducer, + const RegTree& tree) { + if (candidates.empty()) return; + // Some nodes we will manually compute histograms + // others we will do by subtraction + std::vector hist_nidx; + std::vector subtraction_nidx; + for (auto& e : candidates) { + // Decide whether to build the left histogram or right histogram + // Use sum of Hessian as a heuristic to select node with fewest training instances + bool fewer_right = e.split.right_sum.GetHess() < e.split.left_sum.GetHess(); + if (fewer_right) { + hist_nidx.emplace_back(tree[e.nid].RightChild()); + subtraction_nidx.emplace_back(tree[e.nid].LeftChild()); + } else { + hist_nidx.emplace_back(tree[e.nid].LeftChild()); + subtraction_nidx.emplace_back(tree[e.nid].RightChild()); + } + } + std::vector all_new = hist_nidx; + all_new.insert(all_new.end(), subtraction_nidx.begin(), subtraction_nidx.end()); + // Allocate the histograms + // Guaranteed contiguous memory + hist.AllocateHistograms(all_new); + + for (auto nidx : hist_nidx) { + this->BuildHist(nidx); } - this->BuildHist(build_hist_nidx); - this->AllReduceHist(build_hist_nidx, reducer); + // Reduce all in one go + // This gives much better latency in a distributed setting + // when processing a large batch + this->AllReduceHist(hist_nidx.at(0), reducer, hist_nidx.size()); - // Check whether we can use the subtraction trick to calculate the other - bool do_subtraction_trick = this->CanDoSubtractionTrick( - candidate.nid, build_hist_nidx, subtraction_trick_nidx); + for (int i = 0; i < subtraction_nidx.size(); i++) { + auto build_hist_nidx = hist_nidx.at(i); + auto subtraction_trick_nidx = subtraction_nidx.at(i); + auto parent_nidx = candidates.at(i).nid; - if (do_subtraction_trick) { - // Calculate other histogram using subtraction trick - this->SubtractionTrick(candidate.nid, build_hist_nidx, - subtraction_trick_nidx); - } else { - // 
Calculate other histogram manually - this->BuildHist(subtraction_trick_nidx); - this->AllReduceHist(subtraction_trick_nidx, reducer); + if (!this->SubtractionTrick(parent_nidx, build_hist_nidx, subtraction_trick_nidx)) { + // Calculate other histogram manually + this->BuildHist(subtraction_trick_nidx); + this->AllReduceHist(subtraction_trick_nidx, reducer, 1); + } } } @@ -605,8 +633,9 @@ struct GPUHistMakerDevice { GradientPairPrecise{}, thrust::plus{}); rabit::Allreduce(reinterpret_cast(&root_sum), 2); + hist.AllocateHistograms({kRootNIdx}); this->BuildHist(kRootNIdx); - this->AllReduceHist(kRootNIdx, reducer); + this->AllReduceHist(kRootNIdx, reducer, 1); // Remember root stats node_sum_gradients[kRootNIdx] = root_sum; @@ -646,6 +675,7 @@ struct GPUHistMakerDevice { std::copy_if(expand_set.begin(), expand_set.end(), std::back_inserter(filtered_expand_set), [&](const auto& e) { return driver.IsChildValid(e); }); + auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); @@ -658,18 +688,12 @@ struct GPUHistMakerDevice { monitor.Stop("UpdatePosition"); } - for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - - monitor.Start("BuildHist"); - this->BuildHistLeftRight(candidate, left_child_nidx, right_child_nidx, reducer); - monitor.Stop("BuildHist"); - } + monitor.Start("BuildHist"); + this->BuildHistLeftRight(filtered_expand_set, reducer, tree); + monitor.Stop("BuildHist"); for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = expand_set.at(i); + auto candidate = filtered_expand_set.at(i); int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 3b543a48d7cc..75d97b681a61 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -95,7 +95,6 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); TestDeterministicHistogram(is_dense, shm_size); } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index b3c08736c996..be51d3cc5e31 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -29,29 +29,38 @@ TEST(GpuHist, DeviceHistogram) { constexpr size_t kNBins = 128; constexpr size_t kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; - DeviceHistogram histogram; + DeviceHistogramStorage histogram; histogram.Init(0, kNBins); - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } histogram.Reset(); ASSERT_EQ(histogram.Data().size(), kStopGrowing); // Use allocated memory but do not erase nidx_map. - for (size_t i = 0; i < kNNodes; ++i) { - histogram.AllocateHistogram(i); + for (int i = 0; i < kNNodes; ++i) { + histogram.AllocateHistograms({i}); } - for (size_t i = 0; i < kNNodes; ++i) { + for (int i = 0; i < kNNodes; ++i) { ASSERT_TRUE(histogram.HistogramExists(i)); } - // Erase existing nidx_map. 
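// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-bin identity behind the
// subtraction trick used by BuildHistLeftRight above. Every row of the parent
// lands in exactly one child, so once the child with the smaller Hessian sum
// (fewer rows) has been built explicitly, the sibling follows bin by bin as
// parent - built_child. The types below are simplified stand-ins for
// GradientPairPrecise and the device-side histogram spans.
// ---------------------------------------------------------------------------
#include <cstddef>
#include <vector>

struct GradPairSketch {
  double grad;
  double hess;
};

GradPairSketch operator-(const GradPairSketch& a, const GradPairSketch& b) {
  return {a.grad - b.grad, a.hess - b.hess};
}

// Derive the sibling histogram from the parent histogram and the explicitly
// built child histogram.
std::vector<GradPairSketch> SubtractionTrickSketch(
    const std::vector<GradPairSketch>& parent, const std::vector<GradPairSketch>& built_child) {
  std::vector<GradPairSketch> sibling(parent.size());
  for (std::size_t i = 0; i < parent.size(); ++i) {
    sibling[i] = parent[i] - built_child[i];
  }
  return sibling;
}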
- for (size_t i = kNNodes; i < kNNodes * 2; ++i) { - histogram.AllocateHistogram(i); - } - for (size_t i = 0; i < kNNodes; ++i) { - ASSERT_FALSE(histogram.HistogramExists(i)); + // Add two new nodes + histogram.AllocateHistograms({kNNodes}); + histogram.AllocateHistograms({kNNodes+1}); + + // Old cached nodes should still exist + for (int i = 0; i < kNNodes; ++i) { + ASSERT_TRUE(histogram.HistogramExists(i)); } + + // Should be deleted + ASSERT_FALSE(histogram.HistogramExists({kNNodes})); + // Most recent node should exist + ASSERT_TRUE(histogram.HistogramExists({kNNodes + 1})); + + // Add same node again - should fail + EXPECT_ANY_THROW(histogram.AllocateHistograms({kNNodes+1});); } std::vector GetHostHistGpair() { @@ -96,9 +105,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); maker.row_partitioner.reset(new RowPartitioner(0, kNRows)); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); - maker.histogram_rounding = CreateRoundingFactor(maker.gpair);; + maker.histogram_rounding = CreateRoundingFactor(maker.gpair); BuildGradientHistogram( page->GetDeviceAccessor(0), maker.feature_groups->DeviceAccessor(0), @@ -106,7 +115,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.hist.GetNodeHistogram(0), maker.histogram_rounding, !use_shared_memory_histograms); - DeviceHistogram& d_hist = maker.hist; + DeviceHistogramStorage& d_hist = maker.hist; auto node_histogram = d_hist.GetNodeHistogram(0); // d_hist.data stored in float, not gradient pair @@ -129,12 +138,10 @@ void TestBuildHist(bool use_shared_memory_histograms) { TEST(GpuHist, BuildHistGlobalMem) { TestBuildHist(false); - TestBuildHist(false); } TEST(GpuHist, BuildHistSharedMem) { TestBuildHist(true); - TestBuildHist(true); } HistogramCutsWrapper GetHostCutMatrix () { @@ -198,7 +205,7 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice::hist maker.hist.Init(0, (max_bins - 1) * kNCols); - maker.hist.AllocateHistogram(0); + maker.hist.AllocateHistograms({0}); // Each row of hist_gpair represents gpairs for one feature. // Each entry represents a bin. std::vector hist_gpair = GetHostHistGpair(); diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 257085b0c8f9..8748ddcbdf91 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note, reproduce_failure +from hypothesis import given, strategies, assume, settings, note sys.path.append("tests/python") import testing as tm From 1dd1a6cc1c74a45dcb986546f0fab753d359c70b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 10 May 2022 05:07:31 -0700 Subject: [PATCH 11/64] Limit concurrent nodes --- src/tree/driver.h | 10 ++++++---- src/tree/updater_gpu_hist.cu | 5 ++++- tests/cpp/tree/gpu_hist/test_driver.cu | 18 +++++++++++++----- 3 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/tree/driver.h b/src/tree/driver.h index e61255e043c7..0aef93ccf9cd 100644 --- a/src/tree/driver.h +++ b/src/tree/driver.h @@ -33,10 +33,11 @@ class Driver { std::function>; public: - explicit Driver(TrainParam param) + explicit Driver(TrainParam param, std::size_t max_node_batch_size = 256) : param_(param), - queue_(param.grow_policy == TrainParam::kDepthWise ? 
DepthWise : - LossGuide) {} + max_node_batch_size_(max_node_batch_size), + queue_(param.grow_policy == TrainParam::kDepthWise ? DepthWise + : LossGuide) {} template void Push(EntryIterT begin, EntryIterT end) { for (auto it = begin; it != end; ++it) { @@ -84,7 +85,7 @@ class Driver { std::vector result; ExpandEntryT e = queue_.top(); int level = e.depth; - while (e.depth == level && !queue_.empty()) { + while (e.depth == level && !queue_.empty() && result.size() < max_node_batch_size_) { queue_.pop(); if (e.IsValid(param_, num_leaves_)) { num_leaves_++; @@ -101,6 +102,7 @@ class Driver { private: TrainParam param_; std::size_t num_leaves_ = 1; + std::size_t max_node_batch_size_; ExpandQueue queue_; }; } // namespace tree diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 964a486baf16..eb10b42fc2fa 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -90,6 +90,7 @@ class DeviceHistogramStorage { sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); + public: // Start with about 16mb @@ -206,6 +207,7 @@ struct GPUHistMakerDevice { std::unique_ptr feature_groups; + GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, @@ -653,7 +655,8 @@ struct GPUHistMakerDevice { RegTree* p_tree, dh::AllReducer* reducer, HostDeviceVector* p_out_position) { auto& tree = *p_tree; - Driver driver(param); + // Process maximum 32 nodes at a time + Driver driver(param, 32); monitor.Start("Reset"); this->Reset(gpair_all, p_fmat, p_fmat->Info().num_col_); diff --git a/tests/cpp/tree/gpu_hist/test_driver.cu b/tests/cpp/tree/gpu_hist/test_driver.cu index d7f8cc63869e..8e7164e37bec 100644 --- a/tests/cpp/tree/gpu_hist/test_driver.cu +++ b/tests/cpp/tree/gpu_hist/test_driver.cu @@ -8,8 +8,8 @@ namespace tree { TEST(GpuHist, DriverDepthWise) { TrainParam p; p.InitAllowUnknown(Args{}); - p.grow_policy=TrainParam::kDepthWise; - Driver driver(p); + p.grow_policy = TrainParam::kDepthWise; + Driver driver(p, 2); EXPECT_TRUE(driver.Pop().empty()); DeviceSplitCandidate split; split.loss_chg = 1.0f; @@ -20,15 +20,23 @@ TEST(GpuHist, DriverDepthWise) { EXPECT_EQ(driver.Pop().front().nid, 0); driver.Push({GPUExpandEntry{1, 1, split, 2.0f, 1.0f, 1.0f}}); driver.Push({GPUExpandEntry{2, 1, split, 2.0f, 1.0f, 1.0f}}); - driver.Push({GPUExpandEntry{3, 2, split, 2.0f, 1.0f, 1.0f}}); - // Should return entries from level 1 + driver.Push({GPUExpandEntry{3, 1, split, 2.0f, 1.0f, 1.0f}}); + driver.Push({GPUExpandEntry{4, 2, split, 2.0f, 1.0f, 1.0f}}); + // Should return 2 entries from level 1 + // as we limited the driver to pop maximum 2 nodes auto res = driver.Pop(); EXPECT_EQ(res.size(), 2); for (auto &e : res) { EXPECT_EQ(e.depth, 1); } + + // Should now return 1 entry from level 1 + res = driver.Pop(); + EXPECT_EQ(res.size(), 1); + EXPECT_EQ(res.at(0).depth, 1); + res = driver.Pop(); - EXPECT_EQ(res[0].depth, 2); + EXPECT_EQ(res.at(0).depth, 2); EXPECT_TRUE(driver.Pop().empty()); } From 8751d14956d3a85ef0aaef40f223cfe485539973 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 11 May 2022 04:17:36 -0700 Subject: [PATCH 12/64] Lint --- src/tree/updater_gpu_hist.cu | 11 ++++------- tests/cpp/tree/test_gpu_hist.cu | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 
eb10b42fc2fa..88978142ee2e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -88,9 +88,7 @@ class DeviceHistogramStorage { int device_id_; static constexpr size_t kNumItemsInGradientSum = sizeof(GradientSumT) / sizeof(typename GradientSumT::ValueT); - static_assert(kNumItemsInGradientSum == 2, - "Number of items in gradient type should be 2."); - + static_assert(kNumItemsInGradientSum == 2, "Number of items in gradient type should be 2."); public: // Start with about 16mb @@ -107,11 +105,10 @@ class DeviceHistogramStorage { overflow_nidx_map_.clear(); } bool HistogramExists(int nidx) const { - return nidx_map_.find(nidx) != nidx_map_.cend() || overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); - } - int Bins() const { - return n_bins_; + return nidx_map_.find(nidx) != nidx_map_.cend() || + overflow_nidx_map_.find(nidx) != overflow_nidx_map_.cend(); } + int Bins() const { return n_bins_; } size_t HistogramSize() const { return n_bins_ * kNumItemsInGradientSum; } dh::device_vector& Data() { return data_; } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index be51d3cc5e31..7d06d1731c5a 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -27,7 +27,7 @@ TEST(GpuHist, DeviceHistogram) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. dh::safe_cuda(cudaSetDevice(0)); constexpr size_t kNBins = 128; - constexpr size_t kNNodes = 4; + constexpr int kNNodes = 4; constexpr size_t kStopGrowing = kNNodes * kNBins * 2u; DeviceHistogramStorage histogram; histogram.Init(0, kNBins); @@ -47,7 +47,7 @@ TEST(GpuHist, DeviceHistogram) { // Add two new nodes histogram.AllocateHistograms({kNNodes}); - histogram.AllocateHistograms({kNNodes+1}); + histogram.AllocateHistograms({kNNodes + 1}); // Old cached nodes should still exist for (int i = 0; i < kNNodes; ++i) { From 49809bf2a700067fd28879a177b0a339d1395944 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 11 May 2022 08:11:02 -0700 Subject: [PATCH 13/64] Basic blockwise partitioning --- src/tree/gpu_hist/row_partitioner.cuh | 98 ++++++++++++++++--- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 43 ++++++++ 2 files changed, 127 insertions(+), 14 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9470b6447512..2bba8fd51133 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -9,9 +9,52 @@ #include "xgboost/generic_parameters.h" #include "xgboost/task.h" #include "xgboost/tree_model.h" +#include namespace xgboost { namespace tree { + + /** \brief Used to demarcate a contiguous set of row indices associated with + * some tree node. 
*/ +struct Segment { + size_t begin{0}; + size_t end{0}; + + Segment() = default; + + Segment(size_t begin, size_t end) : begin(begin), end(end) { CHECK_GE(end, begin); } + __host__ __device__ size_t Size() const { return end - begin; } +}; + +constexpr int kUpdatePositionMaxBatch = 32; +struct UpdatePositionBatchArgs { + bst_node_t nidx_batch[kUpdatePositionMaxBatch]; + bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; + bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; + Segment segments_batch[kUpdatePositionMaxBatch]; +}; + +template +__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, + OpT op, common::Span ridx, + common::Span position, + common::Span left_counts) { + auto segment = args.segments_batch[blockIdx.x]; + auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); + auto position_segment = position.subspan(segment.begin, segment.Size()); + thrust::sort_by_key(thrust::seq, ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), + position_segment.data(), [=] __device__(auto a, auto b) { return op(a) < op(b); }); + + auto left_nidx = args.left_nidx_batch[blockIdx.x]; + int64_t left_count = 0; + for (int i = segment.begin; i < segment.end; i++) { + bst_node_t new_position = op(ridx[i]); // new node id + left_count += new_position == left_nidx; + position[i] = new_position; + } + left_counts[blockIdx.x] = left_count; +} + /*! \brief Count how many rows are assigned to left node. */ __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) { #if __CUDACC_VER_MAJOR__ > 8 @@ -36,7 +79,6 @@ __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment class RowPartitioner { public: using RowIndexT = bst_uint; - struct Segment; static constexpr bst_node_t kIgnoredTreePosition = -1; private: @@ -98,6 +140,47 @@ class RowPartitioner { */ std::vector GetPositionHost(); + template + void UpdatePositionBatch(const std::vector& nidx, + const std::vector& left_nidx, + const std::vector& right_nidx, UpdatePositionOpT op) { + // Impose this limit because we are passing arguments for each node to the kernel by parameter + // this avoids memcpy but we cannot pass arbitrary number of arguments + CHECK_EQ(nidx.size(), left_nidx.size()); + CHECK_EQ(nidx.size(), right_nidx.size()); + CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); + auto left_counts = pinned_.GetSpan(nidx.size(), 0); + + + // Prepare kernel arguments + UpdatePositionBatchArgs args; + std::copy(nidx.begin(),nidx.end(),args.nidx_batch); + std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); + std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); + for(int i = 0; i < nidx.size(); i++){ + args.segments_batch[i]=ridx_segments_.at(nidx[i]); + } + + // 1 block per node + UpdatePositionBatchKernel<<>>( + args, op, ridx_.CurrentSpan(), + position_.CurrentSpan(), left_counts); + + dh::safe_cuda(cudaDeviceSynchronize()); + + // Update segments + for (int i = 0; i < nidx.size(); i++) { + auto segment=ridx_segments_.at(nidx[i]); + auto left_count = left_counts[i]; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[i], right_nidx[i]) + 1)); + ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + } + } + /** * \brief Updates the tree position for set of training instances being split * into left and right child nodes. 
Accepts a user-defined lambda specifying @@ -215,19 +298,6 @@ class RowPartitioner { void SortPositionAndCopy(const Segment& segment, bst_node_t left_nidx, bst_node_t right_nidx, int64_t* d_left_count, cudaStream_t stream); - /** \brief Used to demarcate a contiguous set of row indices associated with - * some tree node. */ - struct Segment { - size_t begin { 0 }; - size_t end { 0 }; - - Segment() = default; - - Segment(size_t begin, size_t end) : begin(begin), end(end) { - CHECK_GE(end, begin); - } - size_t Size() const { return end - begin; } - }; }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index c8aaf82dcb3e..1e5da8a33e5c 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -106,6 +106,49 @@ void TestUpdatePosition() { TEST(RowPartitioner, Basic) { TestUpdatePosition(); } +void TestUpdatePositionBatch() { + const int kNumRows = 10; + RowPartitioner rp(0, kNumRows); + auto rows = rp.GetRowsHost(0); + EXPECT_EQ(rows.size(), kNumRows); + for (auto i = 0ull; i < kNumRows; i++) { + EXPECT_EQ(rows[i], i); + } + // Send the first five training instances to the right node + // and the second 5 to the left node + rp.UpdatePosition({0}, {1}, {2}, + [=] __device__(RowPartitioner::RowIndexT ridx) { + if (ridx > 4) { + return 1; + } + else { + return 2; + } + }); + rows = rp.GetRowsHost(1); + for (auto r : rows) { + EXPECT_GT(r, 4); + } + rows = rp.GetRowsHost(2); + for (auto r : rows) { + EXPECT_LT(r, 5); + } + + // Split the left node again + rp.UpdatePositionBatch({1}, {3}, {4}, [=] __device__(RowPartitioner::RowIndexT ridx) { + if (ridx < 7) { + return 3; + } + return 4; + }); + EXPECT_EQ(rp.GetRows(3).size(), 2); + EXPECT_EQ(rp.GetRows(4).size(), 3); + // Check position is as expected + EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); +} + +TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } + void TestFinalise() { const int kNumRows = 10; From 181d7cf2ddf8de96e0144d1378ca2e1b3268c225 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 12 May 2022 03:10:36 -0700 Subject: [PATCH 14/64] Working block partition --- tests/cpp/common/test_device_helpers.cu | 129 ++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 6e8668bd2581..b2d28a0b5320 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include "../../../src/common/device_helpers.cuh" @@ -264,4 +265,132 @@ void TestAtomicAdd() { TEST(AtomicAdd, Int64) { TestAtomicAdd(); } + +template +class BlockPartition { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + + // Get left count + std::size_t left_count = 0; + if (threadIdx.x == 0) { + for (int i = 0; i < (end - begin); i++) { + left_count += op(begin[i]); + } + lcomp[0] = left_count; + } + __syncthreads(); + left_count = lcomp[0]; + // + + std::size_t loffset = 0, part = left_count, roffset = part; + auto count = end - begin; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && 
roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; + +template +__global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { + auto count = BlockPartition().Partition(begin, end, op); + if (threadIdx.x == 0) { + *count_out = count; + } +} + +template +void TestBlockPartition(thrust::device_vector& x) { + thrust::device_vector count(1); + + auto op = [] __device__(int y) { return y % 2 == 0; }; + TestBlockPartitionKernel + <<<1, kBlockSize>>>(x.data().get(), x.data().get() + x.size(), count.data().get(), op); + + auto reference = thrust::count_if(x.begin(), x.end(), op); + EXPECT_EQ(count[0], reference); + + auto left_partition_count = thrust::count_if(x.begin(), x.begin() + count[0], op); + EXPECT_EQ(count[0], left_partition_count); + auto right_partition_count = thrust::count_if(x.begin() + count[0], x.end(), op); + EXPECT_EQ(0, right_partition_count); +} + +TEST(BlockPartition, BlockPartitionEmpty) { + thrust::device_vector x; + TestBlockPartition<256>(x); +} + +TEST(BlockPartition, BlockPartitionUniform) { + thrust::device_vector x(100); + TestBlockPartition<256>(x); + thrust::fill(x.begin(),x.end(),1); + TestBlockPartition<256>(x); +} + +void MakeRandom(thrust::device_vector& x, int seed) { + auto counting = thrust::make_counting_iterator(0); + thrust::transform(counting, counting + x.size(), x.begin(), [=] __device__(auto idx) { + thrust::default_random_engine gen(seed); + thrust::uniform_int_distribution dist; + gen.discard(idx); + return dist(gen); + }); +} + +TEST(BlockPartition, BlockPartitionBasic) { + thrust::device_vector x = std::vector{0,1,2}; + TestBlockPartition<256>(x); +} + +TEST(BlockPartition, BlockPartition) { + int sizes[] = {1, 37, 1092}; + int seeds[] = {0, 1, 2, 3, 4}; + for (auto seed : seeds) { + for (auto size : sizes) { + thrust::device_vector x(size); + MakeRandom(x, seed); + thrust::device_vector y = x; + TestBlockPartition<1>(y); + y = x; + TestBlockPartition<1024>(y); + y = x; + TestBlockPartition<37>(y); + } + } +} + } // namespace xgboost From 666eb9b42d93e5f1fa49cc11cda2f38d12523414 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 12 May 2022 04:39:24 -0700 Subject: [PATCH 15/64] Reduction --- tests/cpp/common/test_device_helpers.cu | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index b2d28a0b5320..16805128b8c2 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ 
b/tests/cpp/common/test_device_helpers.cu @@ -275,21 +275,24 @@ class BlockPartition { __shared__ typename BlockScanT::TempStorage temp1, temp2; __shared__ std::size_t lcomp[kBlockSize]; __shared__ std::size_t rcomp[kBlockSize]; + __shared__ int64_t tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); // Get left count + std::size_t count = end - begin; std::size_t left_count = 0; - if (threadIdx.x == 0) { - for (int i = 0; i < (end - begin); i++) { - left_count += op(begin[i]); - } - lcomp[0] = left_count; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); } + atomicAdd(&tmp_sum, left_count); __syncthreads(); - left_count = lcomp[0]; - // + left_count = tmp_sum; std::size_t loffset = 0, part = left_count, roffset = part; - auto count = end - begin; std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { From 66173c74fa674b899e7727aebc15e6f7b87a06e7 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 04:30:24 -0700 Subject: [PATCH 16/64] Some failing tests --- src/common/device_helpers.cuh | 61 ++++++++++++++++++ src/tree/gpu_hist/row_partitioner.cuh | 31 +++++----- src/tree/updater_gpu_hist.cu | 62 ++++++++++++------- tests/cpp/common/test_device_helpers.cu | 62 +------------------ .../cpp/tree/gpu_hist/test_row_partitioner.cu | 8 +-- 5 files changed, 120 insertions(+), 104 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 334e3b4f89bf..20cb951e8805 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1639,4 +1639,65 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; + +template +class BlockPartition { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + __shared__ unsigned long long int tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); + + // Get left count + std::size_t count = end - begin; + std::size_t left_count = 0; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + + std::size_t loffset = 0, part = left_count, roffset = part; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; } // namespace dh diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 2bba8fd51133..7e5cbf90d9b1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -27,32 +27,32 @@ struct Segment { }; constexpr int kUpdatePositionMaxBatch = 32; +template struct UpdatePositionBatchArgs { bst_node_t nidx_batch[kUpdatePositionMaxBatch]; bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; Segment segments_batch[kUpdatePositionMaxBatch]; + OpDataT data_batch[kUpdatePositionMaxBatch]; }; -template -__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, +template +__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, OpT op, common::Span ridx, common::Span position, common::Span left_counts) { auto segment = args.segments_batch[blockIdx.x]; + auto data = args.data_batch[blockIdx.x]; auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); auto position_segment = position.subspan(segment.begin, segment.Size()); - thrust::sort_by_key(thrust::seq, ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), - position_segment.data(), [=] __device__(auto a, auto b) { return op(a) < op(b); }); auto left_nidx = args.left_nidx_batch[blockIdx.x]; - int64_t left_count = 0; - for (int i = segment.begin; i < segment.end; i++) { - bst_node_t new_position = op(ridx[i]); // new node id - left_count += new_position == left_nidx; - position[i] = new_position; + auto left_count = dh::BlockPartition().Partition( + ridx_segment.begin(), ridx_segment.end(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + + if (threadIdx.x == 0) { + left_counts[blockIdx.x] = left_count; } - left_counts[blockIdx.x] = left_count; } /*! \brief Count how many rows are assigned to left node. 
*/ @@ -140,10 +140,11 @@ class RowPartitioner { */ std::vector GetPositionHost(); - template + template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, - const std::vector& right_nidx, UpdatePositionOpT op) { + const std::vector& right_nidx, + const std::vector& op_data, UpdatePositionOpT op) { // Impose this limit because we are passing arguments for each node to the kernel by parameter // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); @@ -153,16 +154,18 @@ class RowPartitioner { // Prepare kernel arguments - UpdatePositionBatchArgs args; + UpdatePositionBatchArgs args; std::copy(nidx.begin(),nidx.end(),args.nidx_batch); std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); + std::copy(op_data.begin(),op_data.end(),args.data_batch); for(int i = 0; i < nidx.size(); i++){ args.segments_batch[i]=ridx_segments_.at(nidx[i]); } // 1 block per node - UpdatePositionBatchKernel<<>>( + constexpr int kBlockSize = 512; + UpdatePositionBatchKernel<<>>( args, op, ridx_.CurrentSpan(), position_.CurrentSpan(), left_counts); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 88978142ee2e..f497c84726e2 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -356,33 +356,49 @@ struct GPUHistMakerDevice { return true; } - void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) { - RegTree::Node split_node = (*p_tree)[e.nid]; - auto split_type = p_tree->NodeSplitType(e.nid); - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto node_cats = e.split.split_cats.Bits(); + // Extra data for each node that is used + // in the update position function + struct NodeSplitData { + RegTree::Node split_node; + FeatureType split_type; + common::CatBitField node_cats; + }; + + void UpdatePosition(const std::vector& candidates, RegTree* p_tree) { + std::vector nidx(candidates.size()); + std::vector left_nidx(candidates.size()); + std::vector right_nidx(candidates.size()); + std::vector split_data(candidates.size()); + for (int i = 0; i < candidates.size(); i++) { + auto& e = candidates[i]; + RegTree::Node split_node = (*p_tree)[e.nid]; + auto split_type = p_tree->NodeSplitType(e.nid); + nidx.at(i) = e.nid; + left_nidx.at(i) = split_node.LeftChild(); + right_nidx.at(i) = split_node.RightChild(); + split_data.at(i) = NodeSplitData{ split_node, split_type, e.split.split_cats }; + } - row_partitioner->UpdatePosition( - e.nid, split_node.LeftChild(), split_node.RightChild(), - [=] __device__(bst_uint ridx) { + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + row_partitioner->UpdatePositionBatch( + nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, const NodeSplitData& data) { // given a row index, returns the node id it belongs to - bst_float cut_value = - d_matrix.GetFvalue(ridx, split_node.SplitIndex()); + bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value bst_node_t new_position = 0; if (isnan(cut_value)) { - new_position = split_node.DefaultChild(); + new_position = data.split_node.DefaultChild(); } else { bool go_left = true; - if (split_type == FeatureType::kCategorical) { - go_left = common::Decision(node_cats, cut_value, split_node.DefaultLeft()); + if (data.split_type == FeatureType::kCategorical) { + go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); } else { - 
go_left = cut_value <= split_node.SplitCond(); + go_left = cut_value <= data.split_node.SplitCond(); } if (go_left) { - new_position = split_node.LeftChild(); + new_position = data.split_node.LeftChild(); } else { - new_position = split_node.RightChild(); + new_position = data.split_node.RightChild(); } } return new_position; @@ -679,14 +695,12 @@ struct GPUHistMakerDevice { auto new_candidates = pinned.GetSpan(filtered_expand_set.size() * 2, GPUExpandEntry()); - for (const auto& e : filtered_expand_set) { - monitor.Start("UpdatePosition"); - // Update position is only run when child is valid, instead of right after apply - // split (as in approx tree method). Hense we have the finalise position call - // in GPU Hist. - this->UpdatePosition(e, p_tree); - monitor.Stop("UpdatePosition"); - } + monitor.Start("UpdatePosition"); + // Update position is only run when child is valid, instead of right after apply + // split (as in approx tree method). Hense we have the finalise position call + // in GPU Hist. + this->UpdatePosition(filtered_expand_set, p_tree); + monitor.Stop("UpdatePosition"); monitor.Start("BuildHist"); this->BuildHistLeftRight(filtered_expand_set, reducer, tree); diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 16805128b8c2..ec9d3af45a45 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,70 +266,10 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } -template -class BlockPartition { - public: - template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; - // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? 
llen : rlen; - // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; - __syncthreads(); - // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; - // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; - } - } - return left_count; - } -}; template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = BlockPartition().Partition(begin, end, op); + auto count = dh::BlockPartition().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 1e5da8a33e5c..2314a622f9e1 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -114,10 +114,10 @@ void TestUpdatePositionBatch() { for (auto i = 0ull; i < kNumRows; i++) { EXPECT_EQ(rows[i], i); } + std::vector extra_data = {0}; // Send the first five training instances to the right node // and the second 5 to the left node - rp.UpdatePosition({0}, {1}, {2}, - [=] __device__(RowPartitioner::RowIndexT ridx) { + rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) { if (ridx > 4) { return 1; } @@ -135,7 +135,7 @@ void TestUpdatePositionBatch() { } // Split the left node again - rp.UpdatePositionBatch({1}, {3}, {4}, [=] __device__(RowPartitioner::RowIndexT ridx) { + rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) { if (ridx < 7) { return 3; } @@ -143,8 +143,6 @@ void TestUpdatePositionBatch() { }); EXPECT_EQ(rp.GetRows(3).size(), 2); EXPECT_EQ(rp.GetRows(4).size(), 3); - // Check position is as expected - EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); } TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } From ec7fea889a83f9ca479b7f4f67b7302497007dd1 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 05:41:20 -0700 Subject: [PATCH 17/64] Handle empty candidate --- src/tree/gpu_hist/row_partitioner.cuh | 3 ++- src/tree/updater_gpu_hist.cu | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 7e5cbf90d9b1..9fb8635d0b57 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -145,14 +145,15 @@ class RowPartitioner { const std::vector& left_nidx, const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { + if (nidx.empty()) return; // Impose this limit because we are passing arguments for each node to the kernel by parameter // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); + CHECK_EQ(nidx.size(), op_data.size()); CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); auto left_counts = pinned_.GetSpan(nidx.size(), 0); - // Prepare kernel arguments UpdatePositionBatchArgs args; std::copy(nidx.begin(),nidx.end(),args.nidx_batch); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index f497c84726e2..3905cd233aac 100644 --- a/src/tree/updater_gpu_hist.cu +++ 
b/src/tree/updater_gpu_hist.cu @@ -356,8 +356,8 @@ struct GPUHistMakerDevice { return true; } - // Extra data for each node that is used - // in the update position function + // Extra data for each node that is passed + // to the update position function struct NodeSplitData { RegTree::Node split_node; FeatureType split_type; @@ -365,6 +365,7 @@ struct GPUHistMakerDevice { }; void UpdatePosition(const std::vector& candidates, RegTree* p_tree) { + if (candidates.empty()) return; std::vector nidx(candidates.size()); std::vector left_nidx(candidates.size()); std::vector right_nidx(candidates.size()); From 49c5f90aaa300bb3d844801b2d7bf2167d740108 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 13 May 2022 06:03:17 -0700 Subject: [PATCH 18/64] Cleanup --- src/tree/gpu_hist/row_partitioner.cu | 119 +----------------- src/tree/gpu_hist/row_partitioner.cuh | 116 ++--------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 98 --------------- 3 files changed, 14 insertions(+), 319 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 9e002f77b64c..8fbded53f913 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -10,73 +10,6 @@ namespace xgboost { namespace tree { -struct IndexFlagTuple { - size_t idx; - size_t flag; -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, - const IndexFlagTuple& b) const { - return {b.idx, a.flag + b.flag}; - } -}; - -struct WriteResultsFunctor { - bst_node_t left_nidx; - common::Span position_in; - common::Span position_out; - common::Span ridx_in; - common::Span ridx_out; - int64_t* d_left_count; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
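// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): a host-side analogue of what a
// single CUDA block does in UpdatePositionBatchKernel for one node, with
// std::partition standing in for dh::BlockPartition. A Segment is just a
// [begin, end) window into the row-index array; after partitioning, the left
// child owns the first left_count rows of the parent window and the right
// child owns the remainder. All names and types here are simplified
// assumptions, not the library API.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cstddef>
#include <vector>

struct SegmentSketch {
  std::size_t begin;
  std::size_t end;
  std::size_t Size() const { return end - begin; }
};

// Partition the parent's rows in place so that rows going to the left child
// come first, then derive the two child segments from left_count.
template <typename GoLeftOp>
void SplitSegmentSketch(std::vector<unsigned>* ridx, SegmentSketch parent, GoLeftOp go_left,
                        SegmentSketch* left_out, SegmentSketch* right_out) {
  auto first = ridx->begin() + static_cast<std::ptrdiff_t>(parent.begin);
  auto last = ridx->begin() + static_cast<std::ptrdiff_t>(parent.end);
  auto mid = std::partition(first, last, go_left);  // not stable, like the block version
  std::size_t left_count = static_cast<std::size_t>(mid - first);
  *left_out = SegmentSketch{parent.begin, parent.begin + left_count};
  *right_out = SegmentSketch{parent.begin + left_count, parent.end};
}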
- int scatter_address; - if (position_in[x.idx] == left_nidx) { - scatter_address = x.flag - 1; // -1 because inclusive scan - } else { - // current number of rows belong to right node + total number of rows - // belong to left node - scatter_address = (x.idx - x.flag) + *d_left_count; - } - // copy the node id to output - position_out[scatter_address] = position_in[x.idx]; - ridx_out[scatter_address] = ridx_in[x.idx]; - - // Discard - return {}; - } -}; - -// Implement partitioning via single scan operation using transform output to -// write the result -void RowPartitioner::SortPosition(common::Span position, - common::Span position_out, - common::Span ridx, - common::Span ridx_out, - bst_node_t left_nidx, bst_node_t, - int64_t* d_left_count, cudaStream_t stream) { - WriteResultsFunctor write_results{left_nidx, position, position_out, - ridx, ridx_out, d_left_count}; - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = dh::MakeTransformIterator( - counting, [=] __device__(size_t idx) { - return IndexFlagTuple{idx, static_cast(position[idx] == left_nidx)}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - position.size(), stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - position.size(), stream); -} - void Reset(int device_idx, common::Span ridx, common::Span position) { CHECK_EQ(ridx.size(), position.size()); @@ -87,26 +20,11 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_a_(num_rows), position_a_(num_rows), - ridx_b_(num_rows), position_b_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); - ridx_ = dh::DoubleBuffer{&ridx_a_, &ridx_b_}; - position_ = dh::DoubleBuffer{&position_a_, &position_b_}; ridx_segments_.emplace_back(Segment(0, num_rows)); - Reset(device_idx, ridx_.CurrentSpan(), position_.CurrentSpan()); - left_counts_.resize(256); - thrust::fill(left_counts_.begin(), left_counts_.end(), 0); - streams_.resize(2); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamCreate(&stream)); - } -} -RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_)); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamDestroy(stream)); - } + Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); } common::Span RowPartitioner::GetRows( @@ -117,15 +35,15 @@ common::Span RowPartitioner::GetRows( if (segment.Size() == 0) { return {}; } - return ridx_.CurrentSpan().subspan(segment.begin, segment.Size()); + return dh::ToSpan(ridx_).subspan(segment.begin, segment.Size()); } common::Span RowPartitioner::GetRows() { - return ridx_.CurrentSpan(); + return dh::ToSpan(ridx_); } common::Span RowPartitioner::GetPosition() { - return position_.CurrentSpan(); + return dh::ToSpan(position_); } std::vector RowPartitioner::GetRowsHost( bst_node_t nidx) { @@ -142,32 +60,5 @@ std::vector RowPartitioner::GetPositionHost() { return position; } -void RowPartitioner::SortPositionAndCopy(const Segment& segment, - bst_node_t left_nidx, - bst_node_t right_nidx, - int64_t* d_left_count, - cudaStream_t stream) { - SortPosition( - // position_in - 
common::Span(position_.Current() + segment.begin, - segment.Size()), - // position_out - common::Span(position_.Other() + segment.begin, - segment.Size()), - // row index in - common::Span(ridx_.Current() + segment.begin, segment.Size()), - // row index out - common::Span(ridx_.Other() + segment.begin, segment.Size()), - left_nidx, right_nidx, d_left_count, stream); - // Copy back key/value - const auto d_position_current = position_.Current() + segment.begin; - const auto d_position_other = position_.Other() + segment.begin; - const auto d_ridx_current = ridx_.Current() + segment.begin; - const auto d_ridx_other = ridx_.Other() + segment.begin; - dh::LaunchN(segment.Size(), stream, [=] __device__(size_t idx) { - d_position_current[idx] = d_position_other[idx]; - d_ridx_current[idx] = d_ridx_other[idx]; - }); -} }; // namespace tree }; // namespace xgboost diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9fb8635d0b57..c6b9b763ff1d 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -55,25 +55,6 @@ __global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, } } -/*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) { -#if __CUDACC_VER_MAJOR__ > 8 - int mask = __activemask(); - unsigned ballot = __ballot_sync(mask, increment); - int leader = __ffs(mask) - 1; - if (threadIdx.x % 32 == leader) { - atomicAdd(reinterpret_cast(d_count), // NOLINT - static_cast(__popc(ballot))); // NOLINT - } -#else - unsigned ballot = __ballot(increment); - if (threadIdx.x % 32 == 0) { - atomicAdd(reinterpret_cast(d_count), // NOLINT - static_cast(__popc(ballot))); // NOLINT - } -#endif -} - /** \brief Class responsible for tracking subsets of rows as we add splits and * partition training rows into different leaf nodes. */ class RowPartitioner { @@ -92,26 +73,18 @@ class RowPartitioner { */ /*! \brief Range of row index for each node, pointers into ridx below. */ std::vector ridx_segments_; - dh::TemporaryArray ridx_a_; - dh::TemporaryArray ridx_b_; - dh::TemporaryArray position_a_; - dh::TemporaryArray position_b_; /*! \brief mapping for node id -> rows. * This looks like: * node id | 1 | 2 | * rows idx | 3, 5, 1 | 13, 31 | */ - dh::DoubleBuffer ridx_; + dh::TemporaryArray ridx_; /*! \brief mapping for row -> node id. */ - dh::DoubleBuffer position_; - dh::caching_device_vector - left_counts_; // Useful to keep a bunch of zeroed memory for sort position - std::vector streams_; + dh::TemporaryArray position_; dh::PinnedMemory pinned_; public: RowPartitioner(int device_idx, size_t num_rows); - ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; @@ -167,8 +140,8 @@ class RowPartitioner { // 1 block per node constexpr int kBlockSize = 512; UpdatePositionBatchKernel<<>>( - args, op, ridx_.CurrentSpan(), - position_.CurrentSpan(), left_counts); + args, op, dh::ToSpan(ridx_), + dh::ToSpan(position_), left_counts); dh::safe_cuda(cudaDeviceSynchronize()); @@ -185,60 +158,6 @@ class RowPartitioner { } } - /** - * \brief Updates the tree position for set of training instances being split - * into left and right child nodes. Accepts a user-defined lambda specifying - * which branch each training instance should go down. - * - * \tparam UpdatePositionOpT - * \param nidx The index of the node being split. - * \param left_nidx The left child index. 
- * \param right_nidx The right child index. - * \param op Device lambda. Should provide the row index as an - * argument and return the new position for this training instance. - */ - template - void UpdatePosition(bst_node_t nidx, bst_node_t left_nidx, - bst_node_t right_nidx, UpdatePositionOpT op) { - Segment segment = ridx_segments_.at(nidx); // rows belongs to node nidx - auto d_ridx = ridx_.CurrentSpan(); - auto d_position = position_.CurrentSpan(); - if (left_counts_.size() <= nidx) { - left_counts_.resize((nidx * 2) + 1); - thrust::fill(left_counts_.begin(), left_counts_.end(), 0); - } - // Now we divide the row segment into left and right node. - - int64_t* d_left_count = left_counts_.data().get() + nidx; - // Launch 1 thread for each row - dh::LaunchN<1, 128>(segment.Size(), [segment, op, left_nidx, right_nidx, d_ridx, d_left_count, - d_position] __device__(size_t idx) { - // LaunchN starts from zero, so we restore the row index by adding segment.begin - idx += segment.begin; - RowIndexT ridx = d_ridx[idx]; - bst_node_t new_position = op(ridx); // new node id - KERNEL_CHECK(new_position == left_nidx || new_position == right_nidx); - AtomicIncrement(d_left_count, new_position == left_nidx); - d_position[idx] = new_position; - }); - // Overlap device to host memory copy (left_count) with sort - int64_t &left_count = pinned_.GetSpan(1)[0]; - dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t), - cudaMemcpyDeviceToHost, streams_[0])); - - SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]); - - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx, right_nidx) + 1)); - ridx_segments_[left_nidx] = - Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx] = - Segment(segment.begin + left_count, segment.end); - } - /** * \brief Finalise the position of all training instances after tree construction is * complete. Does not update any other meta information in this data structure, so @@ -256,10 +175,10 @@ class RowPartitioner { void FinalisePosition(Context const* ctx, ObjInfo task, HostDeviceVector* p_out_position, FinalisePositionOpT op, Sampledp sampledp) { - auto d_position = position_.Current(); - const auto d_ridx = ridx_.Current(); + auto d_position = position_.data().get(); + const auto d_ridx = ridx_.data().get(); if (!task.UpdateTreeLeaf()) { - dh::LaunchN(position_.Size(), [=] __device__(size_t idx) { + dh::LaunchN(position_.size(), [=] __device__(size_t idx) { auto position = d_position[idx]; RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -272,9 +191,9 @@ class RowPartitioner { } p_out_position->SetDevice(ctx->gpu_id); - p_out_position->Resize(position_.Size()); + p_out_position->Resize(position_.size()); auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(position_.Size(), [=] __device__(size_t idx) { + dh::LaunchN(position_.size(), [=] __device__(size_t idx) { auto position = d_position[idx]; RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -285,23 +204,6 @@ class RowPartitioner { d_position[idx] = new_position; }); } - - /** - * \brief Optimised routine for sorting key value pairs into left and right - * segments. Based on a single pass of exclusive scan, uses iterators to - * redirect inputs and outputs. 
- */ - void SortPosition(common::Span position, - common::Span position_out, - common::Span ridx, - common::Span ridx_out, bst_node_t left_nidx, - bst_node_t right_nidx, int64_t* d_left_count, - cudaStream_t stream = nullptr); - - /*! \brief Sort row indices according to position. */ - void SortPositionAndCopy(const Segment& segment, bst_node_t left_nidx, - bst_node_t right_nidx, int64_t* d_left_count, - cudaStream_t stream); }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 2314a622f9e1..e4e5c9dacb60 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -19,93 +19,6 @@ namespace xgboost { namespace tree { -void TestSortPosition(const std::vector& position_in, int left_idx, - int right_idx) { - dh::safe_cuda(cudaSetDevice(0)); - std::vector left_count = { - std::count(position_in.begin(), position_in.end(), left_idx)}; - dh::caching_device_vector d_left_count = left_count; - dh::caching_device_vector position = position_in; - dh::caching_device_vector position_out(position.size()); - - dh::caching_device_vector ridx(position.size()); - thrust::sequence(ridx.begin(), ridx.end()); - dh::caching_device_vector ridx_out(ridx.size()); - RowPartitioner rp(0,10); - rp.SortPosition( - common::Span(position.data().get(), position.size()), - common::Span(position_out.data().get(), position_out.size()), - common::Span(ridx.data().get(), ridx.size()), - common::Span(ridx_out.data().get(), ridx_out.size()), left_idx, - right_idx, d_left_count.data().get(), nullptr); - thrust::host_vector position_result = position_out; - thrust::host_vector ridx_result = ridx_out; - - // Check position is sorted - EXPECT_TRUE(std::is_sorted(position_result.begin(), position_result.end())); - // Check row indices are sorted inside left and right segment - EXPECT_TRUE( - std::is_sorted(ridx_result.begin(), ridx_result.begin() + left_count[0])); - EXPECT_TRUE( - std::is_sorted(ridx_result.begin() + left_count[0], ridx_result.end())); - - // Check key value pairs are the same - for (auto i = 0ull; i < ridx_result.size(); i++) { - EXPECT_EQ(position_result[i], position_in[ridx_result[i]]); - } -} -TEST(GpuHist, SortPosition) { - TestSortPosition({1, 2, 1, 2, 1}, 1, 2); - TestSortPosition({1, 1, 1, 1}, 1, 2); - TestSortPosition({2, 2, 2, 2}, 1, 2); - TestSortPosition({1, 2, 1, 2, 3}, 1, 2); -} - -void TestUpdatePosition() { - const int kNumRows = 10; - RowPartitioner rp(0, kNumRows); - auto rows = rp.GetRowsHost(0); - EXPECT_EQ(rows.size(), kNumRows); - for (auto i = 0ull; i < kNumRows; i++) { - EXPECT_EQ(rows[i], i); - } - // Send the first five training instances to the right node - // and the second 5 to the left node - rp.UpdatePosition(0, 1, 2, - [=] __device__(RowPartitioner::RowIndexT ridx) { - if (ridx > 4) { - return 1; - } - else { - return 2; - } - }); - rows = rp.GetRowsHost(1); - for (auto r : rows) { - EXPECT_GT(r, 4); - } - rows = rp.GetRowsHost(2); - for (auto r : rows) { - EXPECT_LT(r, 5); - } - - // Split the left node again - rp.UpdatePosition(1, 3, 4, [=]__device__(RowPartitioner::RowIndexT ridx) - { - if (ridx < 7) { - return 3 - ; - } - return 4; - }); - EXPECT_EQ(rp.GetRows(3).size(), 2); - EXPECT_EQ(rp.GetRows(4).size(), 3); - // Check position is as expected - EXPECT_EQ(rp.GetPositionHost(), std::vector({3,3,4,4,4,2,2,2,2,2})); -} - -TEST(RowPartitioner, Basic) { TestUpdatePosition(); } - void TestUpdatePositionBatch() { 
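  // Partitions ten rows with UpdatePositionBatch and checks that each child segment
  // ends up holding only the row indices routed to it by the device lambda.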
const int kNumRows = 10; RowPartitioner rp(0, kNumRows); @@ -203,16 +116,5 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } -void TestIncorrectRow() { - RowPartitioner rp(0, 1); - rp.UpdatePosition(0, 1, 2, [=]__device__ (RowPartitioner::RowIndexT ridx) - { - return 4; // This is not the left branch or the right branch - }); -} - -TEST(RowPartitionerDeathTest, IncorrectRow) { - ASSERT_DEATH({ TestIncorrectRow(); },".*"); -} } // namespace tree } // namespace xgboost From bd480822f9127b2420c6d24ee4462d56acbbb4b6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 06:55:46 -0700 Subject: [PATCH 19/64] experiments --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- tests/cpp/common/test_device_helpers.cu | 74 ++++++++++++++++++++++++- 2 files changed, 74 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index c6b9b763ff1d..e779ced8dfdf 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -48,7 +48,7 @@ __global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, auto left_nidx = args.left_nidx_batch[blockIdx.x]; auto left_count = dh::BlockPartition().Partition( - ridx_segment.begin(), ridx_segment.end(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); if (threadIdx.x == 0) { left_counts[blockIdx.x] = left_count; diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index ec9d3af45a45..7ea6cdb1a316 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,10 +266,76 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } +template +class BlockPartitionTune { + public: + template + __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ std::size_t lcomp[kBlockSize]; + __shared__ std::size_t rcomp[kBlockSize]; + + /* + __shared__ unsigned long long int tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); + + // Get left count + std::size_t count = end - begin; + std::size_t left_count = 0; + for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + */ + std::size_t count = end - begin; + std::size_t left_count = count/2; + + std::size_t loffset = 0, part = left_count, roffset = part; + std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; + if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + std::size_t lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + //__syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + // reset the appropriate flags for the longer of the two + if (lidx < minlen) lflag = 0; + if (ridx < minlen) rflag = 0; + if (llen == minlen) loffset += kBlockSize; + if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; + template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = dh::BlockPartition().Partition(begin, end, op); + auto count = BlockPartitionTune().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } @@ -336,4 +402,10 @@ TEST(BlockPartition, BlockPartition) { } } +TEST(BlockPartition, BlockPartitionBenchmark) { + thrust::device_vector x(10000000); + thrust::sequence(x. begin(),x.end()); + TestBlockPartition<1024>(x); +} + } // namespace xgboost From c3ef1f66e05e687ecded030587933c750cf1db6b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 15:13:29 -0700 Subject: [PATCH 20/64] Improvements --- tests/cpp/common/test_device_helpers.cu | 111 ++++++++++++++++++------ 1 file changed, 85 insertions(+), 26 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 7ea6cdb1a316..175e57b54062 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,18 +266,18 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } +/* template class BlockPartitionTune { public: template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; + __shared__ int lcomp[kBlockSize]; + __shared__ int rcomp[kBlockSize]; - /* - __shared__ unsigned long long int tmp_sum; + __shared__ int64_t tmp_sum; if (threadIdx.x == 0) { tmp_sum = 0; @@ -285,41 +285,98 @@ class BlockPartitionTune { __syncthreads(); // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { left_count += op(begin[idx]); } atomicAdd(&tmp_sum, left_count); __syncthreads(); left_count = tmp_sum; - */ - std::size_t count = end - begin; - std::size_t left_count = count/2; + int loffset = 0, part = left_count, roffset = part; + int llen = 0, rlen = 0, minlen = 0; + auto tid = threadIdx.x; + while (loffset < part && roffset < count) { + // find the samples in the left that belong to right and vice-versa + auto loff = loffset + tid, roff = roffset + tid; + int lflag = loff < part ? !op(begin[loff]) : 0; + int rflag = roff < count ? op(begin[roff]) : 0; + // scan to compute the locations for each 'misfit' in the two partitions + int lidx, ridx; + BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); + BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); + __syncthreads(); + minlen = llen < rlen ? 
llen : rlen; + // compaction to figure out the right locations to swap + if (lflag) lcomp[lidx] = loff; + if (rflag) rcomp[ridx] = roff; + __syncthreads(); + loffset += (llen == minlen) ? kBlockSize : minlen; + roffset += (rlen == minlen) ? kBlockSize : minlen; + // swap the 'misfit's + if (tid < minlen) { + auto a = begin[lcomp[tid]]; + auto b = begin[rcomp[tid]]; + begin[lcomp[tid]] = b; + begin[rcomp[tid]] = a; + } + } + return left_count; + } +}; +*/ +struct PartitionScanPair{ + int left; + int right; +}; + +template +class BlockPartitionTune { + public: + template + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp1, temp2; + __shared__ int lcomp[kBlockSize]; + __shared__ int rcomp[kBlockSize]; + __shared__ int64_t tmp_sum; + + if (threadIdx.x == 0) { + tmp_sum = 0; + } + __syncthreads(); - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + // Get left count + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { + left_count += op(begin[idx]); + } + atomicAdd(&tmp_sum, left_count); + __syncthreads(); + left_count = tmp_sum; + + int loffset = 0, part = left_count, roffset = part; + int llen = 0, rlen = 0, minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + int lflag = loff < part ? !op(begin[loff]) : 0; + int rflag = roff < count ? op(begin[roff]) : 0; // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; + int lidx, ridx; BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - //__syncthreads(); minlen = llen < rlen ? llen : rlen; // compaction to figure out the right locations to swap if (lflag) lcomp[lidx] = loff; if (rflag) rcomp[ridx] = roff; __syncthreads(); + // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; + loffset = llen == minlen || llen == 0 ? loffset + kBlockSize : lcomp[minlen]; + roffset = rlen == minlen || rlen == 0 ? roffset + kBlockSize : rcomp[minlen]; // swap the 'misfit's if (tid < minlen) { auto a = begin[lcomp[tid]]; @@ -332,7 +389,6 @@ class BlockPartitionTune { } }; - template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { auto count = BlockPartitionTune().Partition(begin, end, op); @@ -403,9 +459,12 @@ TEST(BlockPartition, BlockPartition) { } TEST(BlockPartition, BlockPartitionBenchmark) { - thrust::device_vector x(10000000); - thrust::sequence(x. 
begin(),x.end()); - TestBlockPartition<1024>(x); + for (int i = 0; i < 20; i++) { + thrust::device_vector x(10000000); + MakeRandom(x, i); + // thrust::sequence(x.begin(), x.end()); + TestBlockPartition<1024>(x); + } } } // namespace xgboost From ba8bbdfd1bf1227b574eea7a3d4d49089e579118 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 14 May 2022 15:24:10 -0700 Subject: [PATCH 21/64] Fused scan --- tests/cpp/common/test_device_helpers.cu | 35 +++++++++++++++---------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 175e57b54062..cab348e12f38 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -325,18 +325,23 @@ class BlockPartitionTune { } }; */ -struct PartitionScanPair{ +struct PartitionScanPair { int left; int right; }; +__device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { + PartitionScanPair c{a.left + b.left, a.right + b.right}; + return c; +} + template class BlockPartitionTune { public: template __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp; __shared__ int lcomp[kBlockSize]; __shared__ int rcomp[kBlockSize]; __shared__ int64_t tmp_sum; @@ -357,26 +362,28 @@ class BlockPartitionTune { left_count = tmp_sum; int loffset = 0, part = left_count, roffset = part; - int llen = 0, rlen = 0, minlen = 0; + int minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa auto loff = loffset + tid, roff = roffset + tid; - int lflag = loff < part ? !op(begin[loff]) : 0; - int rflag = roff < count ? op(begin[roff]) : 0; + + PartitionScanPair flag; + flag.left = loff < part ? !op(begin[loff]) : 0; + flag.right = roff < count ? op(begin[roff]) : 0; // scan to compute the locations for each 'misfit' in the two partitions - int lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - minlen = llen < rlen ? llen : rlen; + PartitionScanPair partial_sum; + PartitionScanPair sum; + BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); + minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; + if (flag.left) lcomp[partial_sum.left] = loff; + if (flag.right) rcomp[partial_sum.right] = roff; __syncthreads(); // reset the appropriate flags for the longer of the two - loffset = llen == minlen || llen == 0 ? loffset + kBlockSize : lcomp[minlen]; - roffset = rlen == minlen || rlen == 0 ? roffset + kBlockSize : rcomp[minlen]; + loffset = sum.left== minlen || sum.left== 0 ? loffset + kBlockSize : lcomp[minlen]; + roffset = sum.right == minlen || sum.right == 0 ? 
roffset + kBlockSize : rcomp[minlen]; // swap the 'misfit's if (tid < minlen) { auto a = begin[lcomp[tid]]; From f4ef4ca798417b8630e7a018ea1596f469e37048 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 15 May 2022 06:44:52 -0700 Subject: [PATCH 22/64] Register blocking --- tests/cpp/common/test_device_helpers.cu | 44 +++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index cab348e12f38..c685d7622164 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -338,12 +338,12 @@ __device__ PartitionScanPair operator+(const PartitionScanPair& a, const Partiti template class BlockPartitionTune { public: - template + template __device__ int Partition(IterT begin, IterT end, OpT op) { typedef cub::BlockScan BlockScanT; __shared__ typename BlockScanT::TempStorage temp; - __shared__ int lcomp[kBlockSize]; - __shared__ int rcomp[kBlockSize]; + __shared__ int lcomp[kBlockSize*kItemsPerThread]; + __shared__ int rcomp[kBlockSize*kItemsPerThread]; __shared__ int64_t tmp_sum; if (threadIdx.x == 0) { @@ -362,34 +362,36 @@ class BlockPartitionTune { left_count = tmp_sum; int loffset = 0, part = left_count, roffset = part; - int minlen = 0; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - - PartitionScanPair flag; - flag.left = loff < part ? !op(begin[loff]) : 0; - flag.right = roff < count ? op(begin[roff]) : 0; + auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; + + PartitionScanPair flag[kItemsPerThread]; + for (int i = 0; i < kItemsPerThread; i++) { + flag[i].left = loff+i < part ? !op(begin[loff+i]) : 0; + flag[i].right = roff+i < count ? op(begin[roff+i]) : 0; + } // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum; + PartitionScanPair partial_sum[kItemsPerThread]; PartitionScanPair sum; BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - minlen = sum.left < sum.right ? sum.left : sum.right; + int minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (flag.left) lcomp[partial_sum.left] = loff; - if (flag.right) rcomp[partial_sum.right] = roff; + for (int i = 0; i < kItemsPerThread; i++) { + if (flag[i].left) lcomp[partial_sum[i].left] = loff+i; + if (flag[i].right) rcomp[partial_sum[i].right] = roff+i; + } __syncthreads(); - // reset the appropriate flags for the longer of the two - loffset = sum.left== minlen || sum.left== 0 ? loffset + kBlockSize : lcomp[minlen]; - roffset = sum.right == minlen || sum.right == 0 ? roffset + kBlockSize : rcomp[minlen]; + loffset = sum.left == minlen ? loffset + kBlockSize*kItemsPerThread : lcomp[minlen]; + roffset = sum.right == minlen ? 
roffset + kBlockSize*kItemsPerThread : rcomp[minlen]; // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; + for(int i = tid; i < minlen; i += kBlockSize){ + auto a = begin[lcomp[i]]; + auto b = begin[rcomp[i]]; + begin[lcomp[i]] = b; + begin[rcomp[i]] = a; } } return left_count; From 9c27dd09df727f02c51df892e9aa8fa19f421f4a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 17 May 2022 02:17:13 -0700 Subject: [PATCH 23/64] Cleanup --- src/common/device_helpers.cuh | 77 ++++++++------ src/tree/gpu_hist/row_partitioner.cuh | 19 ++-- tests/cpp/common/test_device_helpers.cu | 135 +----------------------- 3 files changed, 58 insertions(+), 173 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 20cb951e8805..3fdb994fce3d 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1640,15 +1640,26 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +struct PartitionScanPair { + int left; + int right; +}; + +inline __device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { + PartitionScanPair c{a.left + b.left, a.right + b.right}; + return c; +} + template -class BlockPartition { +class BlockPartition{ public: - template - __device__ std::size_t Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ std::size_t lcomp[kBlockSize]; - __shared__ std::size_t rcomp[kBlockSize]; + template + __device__ int Partition(IterT begin, IterT end, OpT op) { + typedef cub::BlockScan BlockScanT; + __shared__ typename BlockScanT::TempStorage temp; + + __shared__ int16_t lcomp[kBlockSize*kItemsPerThread]; + __shared__ int16_t rcomp[kBlockSize*kItemsPerThread]; __shared__ unsigned long long int tmp_sum; if (threadIdx.x == 0) { @@ -1657,45 +1668,47 @@ class BlockPartition { __syncthreads(); // Get left count - std::size_t count = end - begin; - std::size_t left_count = 0; - for (auto idx : dh::BlockStrideRange(std::size_t(0), count)) { + int count = end - begin; + int left_count = 0; + for (auto idx : dh::BlockStrideRange(int(0), count)) { left_count += op(begin[idx]); } atomicAdd(&tmp_sum, left_count); __syncthreads(); left_count = tmp_sum; - std::size_t loffset = 0, part = left_count, roffset = part; - std::size_t lflag = 0, rflag = 0, llen = 0, rlen = 0, minlen = 0; + int loffset = 0, part = left_count, roffset = part; auto tid = threadIdx.x; while (loffset < part && roffset < count) { // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - if (llen == minlen) lflag = loff < part ? !op(begin[loff]) : 0; - if (rlen == minlen) rflag = roff < count ? op(begin[roff]) : 0; + auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; + + PartitionScanPair flag[kItemsPerThread]; + for (int i = 0; i < kItemsPerThread; i++) { + flag[i].left = loff + i < part ? !op(begin[loff + i]) : 0; + flag[i].right = roff + i < count ? op(begin[roff + i]) : 0; + } // scan to compute the locations for each 'misfit' in the two partitions - std::size_t lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? 
llen : rlen; + PartitionScanPair partial_sum[kItemsPerThread]; + PartitionScanPair sum; + BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); + int minlen = sum.left < sum.right ? sum.left : sum.right; // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; + for (int i = 0; i < kItemsPerThread; i++) { + if (flag[i].left) lcomp[partial_sum[i].left] = tid * kItemsPerThread+i; + if (flag[i].right) rcomp[partial_sum[i].right] = tid * kItemsPerThread+i; + } __syncthreads(); - // reset the appropriate flags for the longer of the two - if (lidx < minlen) lflag = 0; - if (ridx < minlen) rflag = 0; - if (llen == minlen) loffset += kBlockSize; - if (rlen == minlen) roffset += kBlockSize; + // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; + for (int i = tid; i < minlen; i += kBlockSize) { + auto a = begin[lcomp[i] + loffset]; + auto b = begin[rcomp[i] + roffset]; + begin[lcomp[i] + loffset] = b; + begin[rcomp[i] + roffset] = a; } + loffset = sum.left == minlen ? loffset + kBlockSize * kItemsPerThread : loffset + lcomp[minlen]; + roffset = sum.right == minlen ? roffset + kBlockSize * kItemsPerThread : roffset + rcomp[minlen]; } return left_count; } diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e779ced8dfdf..d1a374ec8468 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -37,18 +37,23 @@ struct UpdatePositionBatchArgs { }; template -__global__ void UpdatePositionBatchKernel(UpdatePositionBatchArgs args, +__global__ void +__launch_bounds__(1024, 1) +UpdatePositionBatchKernel(UpdatePositionBatchArgs args, OpT op, common::Span ridx, common::Span position, common::Span left_counts) { - auto segment = args.segments_batch[blockIdx.x]; - auto data = args.data_batch[blockIdx.x]; - auto ridx_segment = ridx.subspan(segment.begin, segment.Size()); - auto position_segment = position.subspan(segment.begin, segment.Size()); + + + const auto& segment = args.segments_batch[blockIdx.x]; + const auto& data = args.data_batch[blockIdx.x]; + const auto& ridx_segment = ridx.subspan(segment.begin, segment.Size()); + const auto& position_segment = position.subspan(segment.begin, segment.Size()); auto left_nidx = args.left_nidx_batch[blockIdx.x]; auto left_count = dh::BlockPartition().Partition( - ridx_segment.data(), ridx_segment.data()+ridx_segment.size(), [=] __device__(auto e) { return op(e, data) == left_nidx; }); + ridx_segment.data(), ridx_segment.data() + ridx_segment.size(), + [&] __device__(auto e) { return op(e, data) == left_nidx; }); if (threadIdx.x == 0) { left_counts[blockIdx.x] = left_count; @@ -138,7 +143,7 @@ class RowPartitioner { } // 1 block per node - constexpr int kBlockSize = 512; + constexpr int kBlockSize = 1024; UpdatePositionBatchKernel<<>>( args, op, dh::ToSpan(ridx_), dh::ToSpan(position_), left_counts); diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index c685d7622164..18fdb5b7eb34 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -266,141 +266,9 @@ TEST(AtomicAdd, Int64) { TestAtomicAdd(); } -/* -template -class BlockPartitionTune { - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp1, temp2; - __shared__ int 
lcomp[kBlockSize]; - __shared__ int rcomp[kBlockSize]; - - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - int loffset = 0, part = left_count, roffset = part; - int llen = 0, rlen = 0, minlen = 0; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid, roff = roffset + tid; - int lflag = loff < part ? !op(begin[loff]) : 0; - int rflag = roff < count ? op(begin[roff]) : 0; - // scan to compute the locations for each 'misfit' in the two partitions - int lidx, ridx; - BlockScanT(temp1).ExclusiveSum(lflag, lidx, llen); - BlockScanT(temp2).ExclusiveSum(rflag, ridx, rlen); - __syncthreads(); - minlen = llen < rlen ? llen : rlen; - // compaction to figure out the right locations to swap - if (lflag) lcomp[lidx] = loff; - if (rflag) rcomp[ridx] = roff; - __syncthreads(); - loffset += (llen == minlen) ? kBlockSize : minlen; - roffset += (rlen == minlen) ? kBlockSize : minlen; - // swap the 'misfit's - if (tid < minlen) { - auto a = begin[lcomp[tid]]; - auto b = begin[rcomp[tid]]; - begin[lcomp[tid]] = b; - begin[rcomp[tid]] = a; - } - } - return left_count; - } -}; -*/ -struct PartitionScanPair { - int left; - int right; -}; - -__device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { - PartitionScanPair c{a.left + b.left, a.right + b.right}; - return c; -} - -template -class BlockPartitionTune { - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp; - __shared__ int lcomp[kBlockSize*kItemsPerThread]; - __shared__ int rcomp[kBlockSize*kItemsPerThread]; - __shared__ int64_t tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - int loffset = 0, part = left_count, roffset = part; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; - - PartitionScanPair flag[kItemsPerThread]; - for (int i = 0; i < kItemsPerThread; i++) { - flag[i].left = loff+i < part ? !op(begin[loff+i]) : 0; - flag[i].right = roff+i < count ? op(begin[roff+i]) : 0; - } - // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum[kItemsPerThread]; - PartitionScanPair sum; - BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - int minlen = sum.left < sum.right ? sum.left : sum.right; - // compaction to figure out the right locations to swap - for (int i = 0; i < kItemsPerThread; i++) { - if (flag[i].left) lcomp[partial_sum[i].left] = loff+i; - if (flag[i].right) rcomp[partial_sum[i].right] = roff+i; - } - __syncthreads(); - - loffset = sum.left == minlen ? loffset + kBlockSize*kItemsPerThread : lcomp[minlen]; - roffset = sum.right == minlen ? 
roffset + kBlockSize*kItemsPerThread : rcomp[minlen]; - // swap the 'misfit's - for(int i = tid; i < minlen; i += kBlockSize){ - auto a = begin[lcomp[i]]; - auto b = begin[rcomp[i]]; - begin[lcomp[i]] = b; - begin[rcomp[i]] = a; - } - } - return left_count; - } -}; - template __global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = BlockPartitionTune().Partition(begin, end, op); + auto count = dh::BlockPartition().Partition(begin, end, op); if (threadIdx.x == 0) { *count_out = count; } @@ -471,7 +339,6 @@ TEST(BlockPartition, BlockPartitionBenchmark) { for (int i = 0; i < 20; i++) { thrust::device_vector x(10000000); MakeRandom(x, i); - // thrust::sequence(x.begin(), x.end()); TestBlockPartition<1024>(x); } } From 0bcc84ac7d2420b1df74293cfe120999b0f9a9d8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 18 May 2022 04:38:34 -0700 Subject: [PATCH 24/64] Working tests --- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index e4e5c9dacb60..70412832802a 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -116,5 +116,160 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } + +const int kMaxBatch = 32; +template +struct KernelArgs { + Segment segments[kMaxBatch]; + OpDataT data[kMaxBatch]; + + // Given a global thread idx, assign it to an item from one of the segments + __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + std::size_t sum = 0; + for (int i = 0; i < kMaxBatch; i++) { + if (sum + segments[i].Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + segments[i].begin; + break; + } + sum += segments[i].Size(); + } + } + std::size_t TotalRows() const { + std::size_t total_rows = 0; + for (auto segment : segments) { + total_rows += segment.Size(); + } + return total_rows; + } +}; + +template +void GetLeftCounts(const KernelArgs&args,common::Span ridx, + common::Span d_left_counts, OpT op + ) { + + // Launch 1 thread for each row + dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + }); +} + +struct IndexFlagTuple { + size_t idx; + bool flag; + size_t flag_scan; + int batch_idx; +}; + +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + if (a.batch_idx == b.batch_idx) { + return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + } else { + return b; + } + } +}; + +template +struct WriteResultsFunctor { + KernelArgs args; + OpT op; + common::Span ridx_in; + common::Span ridx_out; + common::Span left_counts; + + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + // the ex_scan_result represents how many rows have been assigned to left + // node so far during scan. 
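+    // Rows flagged for the left child are scattered to the front of their segment
+    // (flag_scan is an inclusive count, hence the -1 below); rows going right are
+    // placed after the segment's left count computed earlier by GetLeftCounts.
+    // Example (hypothetical values): segment.begin = 4, five rows with left flags
+    // {1,0,1,1,0} and left_counts[batch] = 3 scatter to addresses {4,7,5,6,8}.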
+ std::size_t scatter_address; + if (x.flag) { + scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + } else { + // current number of rows belong to right node + total number of rows + // belong to left node + scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + // Discard + return {}; + } +}; + +template +void SortPositionBatch(const KernelArgs& args, common::Span ridx, + common::Span ridx_tmp, + common::Span left_counts, OpT op, + cudaStream_t stream) { + WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto go_left = op(ridx[item_idx], args.data[batch_idx]); + return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), + args.TotalRows(), stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + + // copy active segments back to original buffer + dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + ridx[item_idx] = ridx_tmp[item_idx]; + }); +} + +void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { + thrust::device_vector ridx = ridx_in; + thrust::device_vector ridx_tmp(ridx_in.size()); + thrust::device_vector left_counts(segments.size()); + + auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; + std::vector op_data(segments.size()); + KernelArgs args; + std::copy(segments.begin(), segments.end(), args.segments); + std::copy(op_data.begin(), op_data.end(), args.data); + GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(left_counts), op, + nullptr); + + auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; + for (int i = 0; i < segments.size(); i++) { + auto begin = ridx.begin() + segments[i].begin; + auto end = ridx.begin() + segments[i].end; + auto left_partition_count = + thrust::count_if(thrust::device, begin, begin + left_counts[i], op_without_data); + EXPECT_EQ(left_partition_count, left_counts[i]); + auto right_partition_count = + thrust::count_if(thrust::device, begin + left_counts[i], end, op_without_data); + EXPECT_EQ(right_partition_count, 0); + } +} + +TEST(GpuHist, SortPositionBatch) { + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 3}, {3, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 1}, {3, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{0, 6}}); + TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); +} + } // namespace tree } // namespace xgboost From 723ff475fda9e3762f5f9af9424e53c0f952dffc Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 18 May 2022 05:08:38 -0700 Subject: [PATCH 25/64] Transplanted new code --- src/tree/gpu_hist/row_partitioner.cu | 2 +- 
src/tree/gpu_hist/row_partitioner.cuh | 168 ++++++++++++++---- src/tree/updater_gpu_hist.cu | 12 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 132 +------------- 4 files changed, 135 insertions(+), 179 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 8fbded53f913..91933c6d2896 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,7 +20,7 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d1a374ec8468..6de8ddff3993 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,38 +26,125 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -constexpr int kUpdatePositionMaxBatch = 32; + +const int kMaxBatch = 32; template -struct UpdatePositionBatchArgs { - bst_node_t nidx_batch[kUpdatePositionMaxBatch]; - bst_node_t left_nidx_batch[kUpdatePositionMaxBatch]; - bst_node_t right_nidx_batch[kUpdatePositionMaxBatch]; - Segment segments_batch[kUpdatePositionMaxBatch]; - OpDataT data_batch[kUpdatePositionMaxBatch]; +struct KernelArgs { + Segment segments[kMaxBatch]; + OpDataT data[kMaxBatch]; + + // Given a global thread idx, assign it to an item from one of the segments + __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + std::size_t sum = 0; + for (int i = 0; i < kMaxBatch; i++) { + if (sum + segments[i].Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + segments[i].begin; + break; + } + sum += segments[i].Size(); + } + } + std::size_t TotalRows() const { + std::size_t total_rows = 0; + for (auto segment : segments) { + total_rows += segment.Size(); + } + return total_rows; + } }; -template -__global__ void -__launch_bounds__(1024, 1) -UpdatePositionBatchKernel(UpdatePositionBatchArgs args, - OpT op, common::Span ridx, - common::Span position, - common::Span left_counts) { +template +void GetLeftCounts(const KernelArgs&args,common::Span ridx, + common::Span d_left_counts, OpT op + ) { + // Launch 1 thread for each row + dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + }); +} + +struct IndexFlagTuple { + size_t idx; + bool flag; + size_t flag_scan; + int batch_idx; +}; - const auto& segment = args.segments_batch[blockIdx.x]; - const auto& data = args.data_batch[blockIdx.x]; - const auto& ridx_segment = ridx.subspan(segment.begin, segment.Size()); - const auto& position_segment = position.subspan(segment.begin, segment.Size()); +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + if (a.batch_idx == b.batch_idx) { + return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + } else { + return b; + } + } +}; - auto left_nidx = args.left_nidx_batch[blockIdx.x]; - auto left_count = dh::BlockPartition().Partition( - 
ridx_segment.data(), ridx_segment.data() + ridx_segment.size(), - [&] __device__(auto e) { return op(e, data) == left_nidx; }); +template +struct WriteResultsFunctor { + KernelArgs args; + OpT op; + common::Span ridx_in; + common::Span ridx_out; + common::Span left_counts; - if (threadIdx.x == 0) { - left_counts[blockIdx.x] = left_count; + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + // the ex_scan_result represents how many rows have been assigned to left + // node so far during scan. + std::size_t scatter_address; + if (x.flag) { + scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + } else { + // current number of rows belong to right node + total number of rows + // belong to left node + scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + // Discard + return {}; } +}; + +template +void SortPositionBatch(const KernelArgs& args, common::Span ridx, + common::Span ridx_tmp, + common::Span left_counts, OpT op, + cudaStream_t stream) { + WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto go_left = op(ridx[item_idx], args.data[batch_idx]); + return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), + args.TotalRows(), stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + + // copy active segments back to original buffer + dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { + // Assign this thread to a row + int batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + ridx[item_idx] = ridx_tmp[item_idx]; + }); } /** \brief Class responsible for tracking subsets of rows as we add splits and @@ -84,6 +171,8 @@ class RowPartitioner { * rows idx | 3, 5, 1 | 13, 31 | */ dh::TemporaryArray ridx_; + // Staging area for sorting ridx + dh::TemporaryArray ridx_tmp_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; @@ -129,31 +218,32 @@ class RowPartitioner { CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - CHECK_LE(nidx.size(), kUpdatePositionMaxBatch); - auto left_counts = pinned_.GetSpan(nidx.size(), 0); + CHECK_LE(nidx.size(), kMaxBatch); + auto h_left_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_left_counts(nidx.size(), 0); // Prepare kernel arguments - UpdatePositionBatchArgs args; - std::copy(nidx.begin(),nidx.end(),args.nidx_batch); - std::copy(left_nidx.begin(),left_nidx.end(),args.left_nidx_batch); - std::copy(right_nidx.begin(),right_nidx.end(),args.right_nidx_batch); - std::copy(op_data.begin(),op_data.end(),args.data_batch); - for(int i = 0; i < nidx.size(); i++){ - args.segments_batch[i]=ridx_segments_.at(nidx[i]); + KernelArgs args; + std::copy(op_data.begin(), op_data.end(), args.data); + for (int i = 0; i < nidx.size(); i++) { + args.segments[i] = ridx_segments_.at(nidx[i]); } + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(d_left_counts), op); + + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, nullptr)); - // 1 block per node - constexpr int kBlockSize = 1024; - UpdatePositionBatchKernel<<>>( - args, op, dh::ToSpan(ridx_), - dh::ToSpan(position_), left_counts); + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_left_counts), op, + nullptr); dh::safe_cuda(cudaDeviceSynchronize()); // Update segments for (int i = 0; i < nidx.size(); i++) { auto segment=ridx_segments_.at(nidx[i]); - auto left_count = left_counts[i]; + auto left_count = h_left_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3905cd233aac..9b1982195fab 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -386,23 +386,17 @@ struct GPUHistMakerDevice { // given a row index, returns the node id it belongs to bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value - bst_node_t new_position = 0; + bool go_left = true; if (isnan(cut_value)) { - new_position = data.split_node.DefaultChild(); + go_left = data.split_node.DefaultLeft(); } else { - bool go_left = true; if (data.split_type == FeatureType::kCategorical) { go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); } else { go_left = cut_value <= data.split_node.SplitCond(); } - if (go_left) { - new_position = data.split_node.LeftChild(); - } else { - new_position = data.split_node.RightChild(); - } } - return new_position; + return go_left; }); } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 70412832802a..e16c1b719426 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -31,12 +31,7 @@ void TestUpdatePositionBatch() { // Send the first five training instances to the right node // and the second 5 to the left node rp.UpdatePositionBatch({0}, {1}, {2}, extra_data, [=] __device__(RowPartitioner::RowIndexT ridx, int) { - if (ridx > 4) { - return 1; - } - else { - return 2; - } + return ridx > 4; }); rows = rp.GetRowsHost(1); for (auto r : rows) { @@ -49,10 +44,7 
@@ void TestUpdatePositionBatch() { // Split the left node again rp.UpdatePositionBatch({1}, {3}, {4}, extra_data,[=] __device__(RowPartitioner::RowIndexT ridx, int) { - if (ridx < 7) { - return 3; - } - return 4; + return ridx < 7; }); EXPECT_EQ(rp.GetRows(3).size(), 2); EXPECT_EQ(rp.GetRows(4).size(), 3); @@ -117,126 +109,6 @@ void TestFinalise() { TEST(RowPartitioner, Finalise) { TestFinalise(); } -const int kMaxBatch = 32; -template -struct KernelArgs { - Segment segments[kMaxBatch]; - OpDataT data[kMaxBatch]; - - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { - std::size_t sum = 0; - for (int i = 0; i < kMaxBatch; i++) { - if (sum + segments[i].Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + segments[i].begin; - break; - } - sum += segments[i].Size(); - } - } - std::size_t TotalRows() const { - std::size_t total_rows = 0; - for (auto segment : segments) { - total_rows += segment.Size(); - } - return total_rows; - } -}; - -template -void GetLeftCounts(const KernelArgs&args,common::Span ridx, - common::Span d_left_counts, OpT op - ) { - - // Launch 1 thread for each row - dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); - atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); - }); -} - -struct IndexFlagTuple { - size_t idx; - bool flag; - size_t flag_scan; - int batch_idx; -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - if (a.batch_idx == b.batch_idx) { - return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; - } else { - return b; - } - } -}; - -template -struct WriteResultsFunctor { - KernelArgs args; - OpT op; - common::Span ridx_in; - common::Span ridx_out; - common::Span left_counts; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
- std::size_t scatter_address; - if (x.flag) { - scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan - } else { - // current number of rows belong to right node + total number of rows - // belong to left node - scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; - } - ridx_out[scatter_address] = ridx_in[x.idx]; - // Discard - return {}; - } -}; - -template -void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, - common::Span left_counts, OpT op, - cudaStream_t stream) { - WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto go_left = op(ridx[item_idx], args.data[batch_idx]); - return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - args.TotalRows(), stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); - - // copy active segments back to original buffer - dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - ridx[item_idx] = ridx_tmp[item_idx]; - }); -} - void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); From 199bed96318ae7e651432aeb94b64037fac8b74d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 03:45:38 -0700 Subject: [PATCH 26/64] Optimised --- src/tree/gpu_hist/row_partitioner.cu | 13 ++- src/tree/gpu_hist/row_partitioner.cuh | 97 +++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 5 +- 3 files changed, 70 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 91933c6d2896..e13f6a8eb188 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,11 +20,22 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_tmp_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); + streams_.resize(2); + for (auto& stream : streams_) { + dh::safe_cuda(cudaStreamCreate(&stream)); + } +} + +RowPartitioner::~RowPartitioner() { + dh::safe_cuda(cudaSetDevice(device_idx_)); + for (auto& stream : streams_) { + dh::safe_cuda(cudaStreamDestroy(stream)); + } } common::Span RowPartitioner::GetRows( diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 6de8ddff3993..9d00e3528307 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -34,9 +34,9 
@@ struct KernelArgs { OpDataT data[kMaxBatch]; // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int &batch_idx, std::size_t &item_idx) const { + __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) const { std::size_t sum = 0; - for (int i = 0; i < kMaxBatch; i++) { + for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { batch_idx = i; item_idx = (idx - sum) + segments[i].begin; @@ -54,53 +54,73 @@ struct KernelArgs { } }; +// Should be 16 bytes aligned +struct IndexFlagTuple { + bst_uint idx; + bst_uint flag_scan; + bst_uint segment_start; + int16_t batch_idx; + bool flag; +}; + +/*! \brief Count how many rows are assigned to left node. */ +__forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, + int batch_idx) { + int mask = __activemask(); + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, 0)); + // If all threads here are working on the same node + // we can do a more efficient reduction with warp intrinsics + if (group_is_contiguous) { + unsigned ballot = __ballot_sync(mask, increment); + int leader = __ffs(mask) - 1; + if (threadIdx.x % 32 == leader) { + atomicAdd(d_counts + batch_idx, // NOLINT + __popc(ballot)); // NOLINT + } + } else { + atomicAdd(d_counts + batch_idx, increment); + } +} + template -void GetLeftCounts(const KernelArgs&args,common::Span ridx, +void GetLeftCounts(const KernelArgs&args,common::Span ridx,common::Span scan_tmp, common::Span d_left_counts, OpT op ) { // Launch 1 thread for each row dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { // Assign this thread to a row - int batch_idx; + int16_t batch_idx; std::size_t item_idx; args.AssignBatch(idx, batch_idx, item_idx); auto op_res = op(ridx[item_idx], args.data[batch_idx]); - atomicAdd(&d_left_counts[batch_idx], op(ridx[item_idx], args.data[batch_idx])); + scan_tmp[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(),op(ridx[item_idx], args.data[batch_idx]), batch_idx); }); } -struct IndexFlagTuple { - size_t idx; - bool flag; - size_t flag_scan; - int batch_idx; -}; - struct IndexFlagOp { __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { if (a.batch_idx == b.batch_idx) { - return {b.idx, b.flag, a.flag_scan + b.flag_scan, b.batch_idx}; + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; } else { return b; } } }; -template struct WriteResultsFunctor { - KernelArgs args; - OpT op; - common::Span ridx_in; - common::Span ridx_out; - common::Span left_counts; + bst_uint* ridx_in; + bst_uint* ridx_out; + unsigned long long int* left_counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { // the ex_scan_result represents how many rows have been assigned to left // node so far during scan. 
std::size_t scatter_address; if (x.flag) { - scatter_address = args.segments[x.batch_idx].begin + x.flag_scan - 1; // -1 because inclusive scan + scatter_address = x.segment_start + x.flag_scan - 1; // -1 because inclusive scan } else { // current number of rows belong to right node + total number of rows // belong to left node @@ -114,35 +134,25 @@ struct WriteResultsFunctor { template void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, + common::Span ridx_tmp, common::Span scan_tmp, common::Span left_counts, OpT op, cudaStream_t stream) { - WriteResultsFunctor write_results{args,op,ridx, ridx_tmp, left_counts}; + static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto go_left = op(ridx[item_idx], args.data[batch_idx]); - return IndexFlagTuple{item_idx, go_left,go_left, batch_idx}; - }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_tmp.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_tmp.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - // Assign this thread to a row - int batch_idx; - std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); + auto item_idx = scan_tmp[idx].idx; ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -173,12 +183,15 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; + dh::TemporaryArray scan_tmp_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; + std::vector streams_; public: RowPartitioner(int device_idx, size_t num_rows); + ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; @@ -228,21 +241,21 @@ class RowPartitioner { for (int i = 0; i < nidx.size(); i++) { args.segments[i] = ridx_segments_.at(nidx[i]); } - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(d_left_counts), op); + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_tmp_), dh::ToSpan(d_left_counts), op); dh::safe_cuda( cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, nullptr)); + cudaMemcpyDefault, streams_[0])); - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_left_counts), op, - nullptr); + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_tmp_), + dh::ToSpan(d_left_counts), op, streams_[1]); - dh::safe_cuda(cudaDeviceSynchronize()); + dh::safe_cuda(cudaStreamSynchronize(streams_[0])); // Update segments for (int i = 0; i < nidx.size(); i++) { - auto segment=ridx_segments_.at(nidx[i]); + auto segment = ridx_segments_.at(nidx[i]); auto left_count = h_left_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index e16c1b719426..5ad2dbc3fc3a 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -113,14 +113,15 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); thrust::device_vector left_counts(segments.size()); + thrust::device_vector scan_tmp(ridx_in.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); KernelArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(left_counts), op, + GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; From 0e35e9949ccf2ecb556be7092679d537bd8c14cf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 06:40:05 -0700 Subject: [PATCH 27/64] Do not initialise data structures to maximum possible tree size. 
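
Previously lower_bounds_, upper_bounds_ and node_sum_gradients were sized to
param.MaxNodes() up front, on the order of 2^max_depth entries even when only
a few nodes are actually created. They now start at 256 entries and are grown
geometrically (Resize(max_nidx * 2 + 1)) as splits introduce new node ids, so
repeated growth stays amortised linear in the number of nodes produced.

A minimal standalone sketch of the pattern in plain C++ (EnsureSize is
illustrative only, not an XGBoost API):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Grow `buf` just enough that index `nidx` is valid, at least doubling
    // so repeated growth costs amortised O(1) per element over the build.
    template <typename T>
    void EnsureSize(std::vector<T>* buf, std::size_t nidx, T fill) {
      if (buf->size() <= nidx) {
        buf->resize(nidx * 2 + 1, fill);
      }
    }

    int main() {
      std::vector<double> node_sum(256, 0.0);  // start small, as below
      std::size_t left = 511, right = 512;     // children created by a split
      EnsureSize(&node_sum, std::max(left, right), 0.0);
      // node_sum.size() is now 1025, large enough for both child indices.
      return 0;
    }

The same grow-on-demand resize is applied below to TreeEvaluator's bound
vectors and to GPUHistMakerDevice::node_sum_gradients.
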
--- src/tree/split_evaluator.h | 15 +++++++++++++-- src/tree/updater_gpu_hist.cu | 17 +++++++++++------ 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h index 8cdf88834559..ee24f11ccaa0 100644 --- a/src/tree/split_evaluator.h +++ b/src/tree/split_evaluator.h @@ -13,6 +13,7 @@ #include #include #include +#include #include "xgboost/tree_model.h" #include "xgboost/host_device_vector.h" @@ -49,8 +50,9 @@ class TreeEvaluator { } else { monotone_.HostVector() = p.monotone_constraints; monotone_.HostVector().resize(n_features, 0); - lower_bounds_.Resize(p.MaxNodes(), -std::numeric_limits::max()); - upper_bounds_.Resize(p.MaxNodes(), std::numeric_limits::max()); + // Initialised to some small size, can grow if needed + lower_bounds_.Resize(256, -std::numeric_limits::max()); + upper_bounds_.Resize(256, std::numeric_limits::max()); has_constraint_ = true; } @@ -157,6 +159,15 @@ class TreeEvaluator { if (!has_constraint_) { return; } + + auto max_nidx = std::max(leftid, rightid); + if (lower_bounds_.Size() <= max_nidx) { + lower_bounds_.Resize(max_nidx * 2 + 1); + } + if (upper_bounds_.Size() <= max_nidx) { + upper_bounds_.Resize(max_nidx * 2 + 1); + } + common::Transform<>::Init( [=] XGBOOST_DEVICE(size_t, common::Span lower, common::Span upper, diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 9b1982195fab..e72de08b203b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -223,7 +223,7 @@ struct GPUHistMakerDevice { // Copy assigning an empty vector causes an exception in MSVC debug builds monotone_constraints = param.monotone_constraints; } - node_sum_gradients.resize(param.MaxNodes()); + node_sum_gradients.resize(256); // Init histogram hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); @@ -625,12 +625,17 @@ struct GPUHistMakerDevice { } evaluator_.ApplyTreeSplit(candidate, p_tree); - node_sum_gradients[tree[candidate.nid].LeftChild()] = candidate.split.left_sum; - node_sum_gradients[tree[candidate.nid].RightChild()] = candidate.split.right_sum; + const auto& parent = tree[candidate.nid]; + std::size_t max_nidx = std::max(parent.LeftChild(), parent.RightChild()); + // Grow as needed + if (node_sum_gradients.size() <= max_nidx) { + node_sum_gradients.resize(max_nidx * 2 + 1); + } + node_sum_gradients[parent.LeftChild()] = candidate.split.left_sum; + node_sum_gradients[parent.RightChild()] = candidate.split.right_sum; - interaction_constraints.Split(candidate.nid, tree[candidate.nid].SplitIndex(), - tree[candidate.nid].LeftChild(), - tree[candidate.nid].RightChild()); + interaction_constraints.Split(candidate.nid, parent.SplitIndex(), parent.LeftChild(), + parent.RightChild()); } GPUExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) { From daa9b56fa41e9517ddb8edcb9f2d8afd2feb5394 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 19 May 2022 13:15:38 -0700 Subject: [PATCH 28/64] Comments, cleanup --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 157 ++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 4 +- 3 files changed, 94 insertions(+), 69 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index e13f6a8eb188..86642ab7170e 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -20,7 +20,7 @@ void Reset(int device_idx, common::Span ridx, } RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : 
device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_tmp_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_inputs_(num_rows),position_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 9d00e3528307..1cf753a59894 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -27,9 +27,9 @@ struct Segment { }; -const int kMaxBatch = 32; template -struct KernelArgs { +struct KernelBatchArgs { + static const int kMaxBatch = 8; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; @@ -54,15 +54,29 @@ struct KernelArgs { } }; -// Should be 16 bytes aligned +// We can scan over this tuple, where the scan gives us information on how to partition inputs +// according to the flag struct IndexFlagTuple { - bst_uint idx; - bst_uint flag_scan; - bst_uint segment_start; - int16_t batch_idx; - bool flag; + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning + bst_uint segment_start; // Start offset of this node segment + int16_t batch_idx; // Which node in the batch does this item belong to + bool flag; // Result of op (is this item going left?) }; +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + // Segmented scan - resets if we cross batch boundaries + if (a.batch_idx == b.batch_idx) { + // Accumulate the flags, everything else stays the same + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; + } else { + return b; + } + } +}; + + /*! \brief Count how many rows are assigned to left node. 
*/ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, int batch_idx) { @@ -83,10 +97,9 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } template -void GetLeftCounts(const KernelArgs&args,common::Span ridx,common::Span scan_tmp, - common::Span d_left_counts, OpT op - ) { - +void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, + common::Span scan_inputs, + common::Span d_left_counts, OpT op) { // Launch 1 thread for each row dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { // Assign this thread to a row @@ -94,22 +107,18 @@ void GetLeftCounts(const KernelArgs&args,common::Span ridx,c std::size_t item_idx; args.AssignBatch(idx, batch_idx, item_idx); auto op_res = op(ridx[item_idx], args.data[batch_idx]); - scan_tmp[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - AtomicIncrement(d_left_counts.data(),op(ridx[item_idx], args.data[batch_idx]), batch_idx); + AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], args.data[batch_idx]), batch_idx); }); } -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - if (a.batch_idx == b.batch_idx) { - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; - } else { - return b; - } - } -}; - +// This is a transformer output iterator +// It takes the result of the scan and performs the partition +// To understand how a scan is used to partition elements see: +// Harris, Mark, Shubhabrata Sengupta, and John D. Owens. "Parallel prefix sum (scan) with CUDA." +// GPU gems 3.39 (2007): 851-876. 
struct WriteResultsFunctor { bst_uint* ridx_in; bst_uint* ridx_out; @@ -132,10 +141,10 @@ struct WriteResultsFunctor { } }; -template -void SortPositionBatch(const KernelArgs& args, common::Span ridx, - common::Span ridx_tmp, common::Span scan_tmp, - common::Span left_counts, OpT op, +template +void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, + common::Span ridx_tmp, common::Span scan_inputs, + common::Span left_counts, cudaStream_t stream) { static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; @@ -143,16 +152,16 @@ void SortPositionBatch(const KernelArgs& args, common::Span thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_tmp.data(), + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_inputs.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_tmp.data(), + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_inputs.data(), discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), [=] __device__(std::size_t idx) { - auto item_idx = scan_tmp[idx].idx; + auto item_idx = scan_inputs[idx].idx; ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -183,7 +192,7 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray scan_tmp_; + dh::TemporaryArray scan_inputs_; /*! \brief mapping for row -> node id. 
*/ dh::TemporaryArray position_; dh::PinnedMemory pinned_; @@ -226,43 +235,59 @@ class RowPartitioner { const std::vector& right_nidx, const std::vector& op_data, UpdatePositionOpT op) { if (nidx.empty()) return; - // Impose this limit because we are passing arguments for each node to the kernel by parameter - // this avoids memcpy but we cannot pass arbitrary number of arguments CHECK_EQ(nidx.size(), left_nidx.size()); CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - CHECK_LE(nidx.size(), kMaxBatch); - auto h_left_counts = pinned_.GetSpan(nidx.size(), 0); - dh::TemporaryArray d_left_counts(nidx.size(), 0); - - // Prepare kernel arguments - KernelArgs args; - std::copy(op_data.begin(), op_data.end(), args.data); - for (int i = 0; i < nidx.size(); i++) { - args.segments[i] = ridx_segments_.at(nidx[i]); - } - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_tmp_), dh::ToSpan(d_left_counts), op); - - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[0])); - - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_tmp_), - dh::ToSpan(d_left_counts), op, streams_[1]); - - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); - - // Update segments - for (int i = 0; i < nidx.size(); i++) { - auto segment = ridx_segments_.at(nidx[i]); - auto left_count = h_left_counts[i]; - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx[i], right_nidx[i]) + 1)); - ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + + // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying + // memory from device to host + for (std::size_t batch_start = 0; batch_start < nidx.size(); + batch_start += KernelBatchArgs::kMaxBatch) { + // Temporary arrays + auto h_left_counts = pinned_.GetSpan(KernelBatchArgs::kMaxBatch, 0); + dh::TemporaryArray d_left_counts(KernelBatchArgs::kMaxBatch, 0); + + std::size_t batch_end = std::min(batch_start + KernelBatchArgs::kMaxBatch, nidx.size()); + // Prepare kernel arguments + KernelBatchArgs args; + std::copy(op_data.begin() + batch_start, op_data.begin() + batch_end, args.data); + for (int i = 0; i < (batch_end - batch_start); i++) { + args.segments[i] = ridx_segments_.at(nidx[batch_start + i]); + } + + // Evaluate the operator for each row, where true means 'go left' + // Store the result of the operator for the next step + // Count the number of rows going left, store in d_left_counts + GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op); + + // Start copying the counts to the host + // We overlap this transfer with the sort step using streams + // We only need the result after sorting to update the segment boundaries + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, streams_[0])); + + // Partition the rows according to the operator + SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), + dh::ToSpan(d_left_counts), streams_[1]); + + dh::safe_cuda(cudaStreamSynchronize(streams_[0])); + + // Update segments + for (int i = 0; i < 
(batch_end - batch_start); i++) { + auto segment = ridx_segments_.at(nidx[batch_start + i]); + auto left_count = h_left_counts[i]; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize( + std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[batch_start + i], right_nidx[batch_start + i]) + 1)); + ridx_segments_[left_nidx[batch_start + i]] = + Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[batch_start + i]] = + Segment(segment.begin + left_count, segment.end); + } } } diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 5ad2dbc3fc3a..22f17248b5f6 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -117,11 +117,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); - KernelArgs args; + KernelBatchArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), op, + SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; From 8ab989e881f2b0dd133ab1f07267e3cc3d4ae5a7 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:08:58 -0700 Subject: [PATCH 29/64] Refactor FinalizePosition --- src/tree/gpu_hist/row_partitioner.cuh | 16 +---- src/tree/updater_gpu_hist.cu | 98 ++++++++++++--------------- 2 files changed, 43 insertions(+), 71 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 1cf753a59894..20315719490b 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -171,7 +171,6 @@ void SortPositionBatch(const KernelBatchArgs& args, common::SpanSetDevice(ctx->gpu_id); p_out_position->Resize(position_.size()); auto sorted_position = p_out_position->DevicePointer(); @@ -331,7 +317,7 @@ class RowPartitioner { RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position; - if (new_position == kIgnoredTreePosition) { + if (new_position == -1) { return; } d_position[idx] = new_position; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e72de08b203b..3d1e1ccb00ba 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -182,10 +182,11 @@ struct GPUHistMakerDevice { std::unique_ptr row_partitioner; DeviceHistogramStorage hist{}; - dh::caching_device_vector d_gpair; // storage for gpair; + dh::device_vector d_gpair; // storage for gpair; common::Span gpair; - dh::caching_device_vector monotone_constraints; + dh::device_vector monotone_constraints; + dh::device_vector update_predictions; /*! \brief Sum gradient for each node. */ std::vector node_sum_gradients; @@ -405,6 +406,16 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, HostDeviceVector* p_out_position) { + if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { + // see comment in the `FinalisePositionInPage`. 
+ LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + } + + // External memory will not use prediction cache + if (!p_fmat->SingleColBlock()) { + return; + } + dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), d_nodes.size() * sizeof(RegTree::Node), @@ -423,25 +434,9 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { - row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); - } - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; - } - if (page->n_rows == p_fmat->Info().num_row_) { - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } else { - for (auto const& batch : p_fmat->GetBatches(batch_param)) { - FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } - } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -453,13 +448,12 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; - row_partitioner->FinalisePosition( - ctx_, task, p_out_position, - [=] __device__(size_t row_id, int position) { + auto new_position_op = [=] __device__(size_t row_id) { // What happens if user prune the tree? if (!d_matrix.IsInRange(row_id)) { - return RowPartitioner::kIgnoredTreePosition; + return -1; } + int position = RegTree::kRoot; auto node = d_nodes[position]; while (!node.IsLeaf()) { @@ -487,41 +481,33 @@ struct GPUHistMakerDevice { } return position; - }, - [d_gpair] __device__(size_t ridx) { - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - return d_gpair[ridx].GetHess() - .0f == 0.f; - }); + }; + p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->Resize(page->n_rows); + update_predictions.resize(page->n_rows); + auto d_update_predictions = dh::ToSpan(update_predictions); + auto sorted_position = p_out_position->DevicePointer(); + dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { + bst_node_t position = new_position_op(idx); + d_update_predictions[idx]=d_nodes[position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + sorted_position[idx] = is_sampled? 
~position : position; + }); } - void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); - auto d_ridx = row_partitioner->GetRows(); - - GPUTrainingParam param_d(param); - dh::TemporaryArray device_node_sum_gradients(node_sum_gradients.size()); - - dh::safe_cuda(cudaMemcpyAsync(device_node_sum_gradients.data().get(), node_sum_gradients.data(), - sizeof(GradientPairPrecise) * node_sum_gradients.size(), - cudaMemcpyHostToDevice)); - auto d_position = row_partitioner->GetPosition(); - auto d_node_sum_gradients = device_node_sum_gradients.data().get(); - auto tree_evaluator = evaluator_.GetEvaluator(); - - auto const& h_nodes = p_tree->GetNodes(); - dh::caching_device_vector nodes(h_nodes.size()); - dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), - h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice)); - auto d_nodes = dh::ToSpan(nodes); - dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable { - bst_node_t nidx = d_position[idx]; - auto weight = d_nodes[nidx].LeafValue(); - out_preds_d(d_ridx[idx]) += weight; + auto d_update_predictions = dh::ToSpan(update_predictions); + if (d_update_predictions.empty()) return false; + CHECK_EQ(out_preds_d.Size(), d_update_predictions.size()); + dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { + out_preds_d(idx) += d_update_predictions[idx]; }); - row_partitioner.reset(); + return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -853,9 +839,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + auto result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return true; + return result; } TrainParam param_; // NOLINT From d50ec4b442431f565958cbeb8eda75782337a44c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:21:36 -0700 Subject: [PATCH 30/64] Remove redundant functions --- src/tree/gpu_hist/row_partitioner.cu | 10 ---- src/tree/gpu_hist/row_partitioner.cuh | 44 -------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 57 ------------------- 3 files changed, 111 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 86642ab7170e..da78b20317f6 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -53,9 +53,6 @@ common::Span RowPartitioner::GetRows() { return dh::ToSpan(ridx_); } -common::Span RowPartitioner::GetPosition() { - return dh::ToSpan(position_); -} std::vector RowPartitioner::GetRowsHost( bst_node_t nidx) { auto span = GetRows(nidx); @@ -64,12 +61,5 @@ std::vector RowPartitioner::GetRowsHost( return rows; } -std::vector RowPartitioner::GetPositionHost() { - auto span = GetPosition(); - std::vector position(span.size()); - dh::CopyDeviceSpanToVector(&position, span); - return position; -} - }; // namespace tree }; // namespace xgboost diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 20315719490b..487eb1fca5ec 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -213,21 +213,11 @@ class RowPartitioner { */ common::Span GetRows(); - /** - * \brief Gets the 
tree position of all training instances. - */ - common::Span GetPosition(); - /** * \brief Convenience method for testing */ std::vector GetRowsHost(bst_node_t nidx); - /** - * \brief Convenience method for testing - */ - std::vector GetPositionHost(); - template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, @@ -289,40 +279,6 @@ class RowPartitioner { } } } - - /** - * \brief Finalise the position of all training instances after tree construction is - * complete. Does not update any other meta information in this data structure, so - * should only be used at the end of training. - * - * When the task requires update leaf, this function will copy the node index into - * p_out_position. The index is negated if it's being sampled in current iteration. - * - * \param p_out_position Node index for each row. - * \param op Device lambda. Should provide the row index and current position as an - * argument and return the new position for this training instance. - * \param sampled A device lambda to inform the partitioner whether a row is sampled. - */ - template - void FinalisePosition(Context const* ctx, ObjInfo task, - HostDeviceVector* p_out_position, FinalisePositionOpT op, - Sampledp sampledp) { - auto d_position = position_.data().get(); - const auto d_ridx = ridx_.data().get(); - p_out_position->SetDevice(ctx->gpu_id); - p_out_position->Resize(position_.size()); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(position_.size(), [=] __device__(size_t idx) { - auto position = d_position[idx]; - RowIndexT ridx = d_ridx[idx]; - bst_node_t new_position = op(ridx, position); - sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position; - if (new_position == -1) { - return; - } - d_position[idx] = new_position; - }); - } }; }; // namespace tree }; // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 22f17248b5f6..9861e79bba66 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -52,63 +52,6 @@ void TestUpdatePositionBatch() { TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } -void TestFinalise() { - const int kNumRows = 10; - - ObjInfo task{ObjInfo::kRegression, false, false}; - HostDeviceVector position; - Context ctx; - ctx.gpu_id = 0; - - { - RowPartitioner rp(0, kNumRows); - rp.FinalisePosition( - &ctx, task, &position, - [=] __device__(RowPartitioner::RowIndexT ridx, int position) { return 7; }, - [] XGBOOST_DEVICE(size_t idx) { return false; }); - - auto position = rp.GetPositionHost(); - for (auto p : position) { - EXPECT_EQ(p, 7); - } - } - - /** - * Test for sampling. - */ - dh::device_vector hess(kNumRows); - for (size_t i = 0; i < hess.size(); ++i) { - // removed rows, 0, 3, 6, 9 - if (i % 3 == 0) { - hess[i] = 0; - } else { - hess[i] = i; - } - } - - auto d_hess = dh::ToSpan(hess); - - RowPartitioner rp(0, kNumRows); - rp.FinalisePosition( - &ctx, task, &position, - [] __device__(RowPartitioner::RowIndexT ridx, bst_node_t position) { - return ridx % 2 == 0 ? 1 : 2; - }, - [d_hess] __device__(size_t ridx) { return d_hess[ridx] - 0.f == 0.f; }); - - auto const& h_position = position.ConstHostVector(); - for (size_t ridx = 0; ridx < h_position.size(); ++ridx) { - if (ridx % 3 == 0) { - ASSERT_LT(h_position[ridx], 0); - } else { - ASSERT_EQ(h_position[ridx], ridx % 2 == 0 ? 
1 : 2); - } - } -} - -TEST(RowPartitioner, Finalise) { TestFinalise(); } - - void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); From c34c3ad07fe753a608adb57fc6368f46f6ac2e39 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:31:04 -0700 Subject: [PATCH 31/64] Lint --- src/tree/gpu_hist/row_partitioner.cu | 13 +---- src/tree/gpu_hist/row_partitioner.cuh | 2 - src/tree/updater_gpu_hist.cu | 69 ++++++++++++++------------- 3 files changed, 37 insertions(+), 47 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index da78b20317f6..b079189e6b7d 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -10,21 +10,12 @@ namespace xgboost { namespace tree { -void Reset(int device_idx, common::Span ridx, - common::Span position) { - CHECK_EQ(ridx.size(), position.size()); - dh::LaunchN(ridx.size(), [=] __device__(size_t idx) { - ridx[idx] = idx; - position[idx] = 0; - }); -} RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows),ridx_tmp_(num_rows),scan_inputs_(num_rows),position_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), scan_inputs_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); - - Reset(device_idx, dh::ToSpan(ridx_), dh::ToSpan(position_)); + thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); streams_.resize(2); for (auto& stream : streams_) { dh::safe_cuda(cudaStreamCreate(&stream)); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 487eb1fca5ec..17a24e9600db 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -192,8 +192,6 @@ class RowPartitioner { // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; dh::TemporaryArray scan_inputs_; - /*! \brief mapping for row -> node id. */ - dh::TemporaryArray position_; dh::PinnedMemory pinned_; std::vector streams_; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1e1ccb00ba..3d1c38ba51d7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -383,7 +383,8 @@ struct GPUHistMakerDevice { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); row_partitioner->UpdatePositionBatch( - nidx, left_nidx, right_nidx, split_data, [=] __device__(bst_uint ridx, const NodeSplitData& data) { + nidx, left_nidx, right_nidx, split_data, + [=] __device__(bst_uint ridx, const NodeSplitData& data) { // given a row index, returns the node id it belongs to bst_float cut_value = d_matrix.GetFvalue(ridx, data.split_node.SplitIndex()); // Missing value @@ -392,7 +393,8 @@ struct GPUHistMakerDevice { go_left = data.split_node.DefaultLeft(); } else { if (data.split_type == FeatureType::kCategorical) { - go_left = common::Decision(data.node_cats.Bits(), cut_value, data.split_node.DefaultLeft()); + go_left = common::Decision(data.node_cats.Bits(), cut_value, + data.split_node.DefaultLeft()); } else { go_left = cut_value <= data.split_node.SplitCond(); } @@ -449,39 +451,38 @@ struct GPUHistMakerDevice { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; auto new_position_op = [=] __device__(size_t row_id) { - // What happens if user prune the tree? 
- if (!d_matrix.IsInRange(row_id)) { - return -1; + // What happens if user prune the tree? + if (!d_matrix.IsInRange(row_id)) { + return -1; + } + int position = RegTree::kRoot; + auto node = d_nodes[position]; + + while (!node.IsLeaf()) { + bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(d_feature_types, position)) { + auto node_cats = categories.subspan(categories_segments[position].beg, + categories_segments[position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); } - int position = RegTree::kRoot; - auto node = d_nodes[position]; - - while (!node.IsLeaf()) { - bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(d_feature_types, position)) { - auto node_cats = - categories.subspan(categories_segments[position].beg, - categories_segments[position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - position = node.LeftChild(); - } else { - position = node.RightChild(); - } - } - node = d_nodes[position]; + if (go_left) { + position = node.LeftChild(); + } else { + position = node.RightChild(); } + } + node = d_nodes[position]; + } - return position; - }; + return position; + }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); @@ -489,11 +490,11 @@ struct GPUHistMakerDevice { auto sorted_position = p_out_position->DevicePointer(); dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { bst_node_t position = new_position_op(idx); - d_update_predictions[idx]=d_nodes[position].LeafValue(); + d_update_predictions[idx] = d_nodes[position].LeafValue(); // FIXME(jiamingy): Doesn't work when sampling is used with external memory as // the sampler compacts the gradient vector. bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled? ~position : position; + sorted_position[idx] = is_sampled ? 
~position : position; }); } From 47bfc6e3e34dea03368d318225ba493d275ad0c8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 03:35:20 -0700 Subject: [PATCH 32/64] Remove old kernel --- src/common/device_helpers.cuh | 74 ----------------------------------- 1 file changed, 74 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 3fdb994fce3d..334e3b4f89bf 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1639,78 +1639,4 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; - -struct PartitionScanPair { - int left; - int right; -}; - -inline __device__ PartitionScanPair operator+(const PartitionScanPair& a, const PartitionScanPair& b) { - PartitionScanPair c{a.left + b.left, a.right + b.right}; - return c; -} - -template -class BlockPartition{ - public: - template - __device__ int Partition(IterT begin, IterT end, OpT op) { - typedef cub::BlockScan BlockScanT; - __shared__ typename BlockScanT::TempStorage temp; - - __shared__ int16_t lcomp[kBlockSize*kItemsPerThread]; - __shared__ int16_t rcomp[kBlockSize*kItemsPerThread]; - __shared__ unsigned long long int tmp_sum; - - if (threadIdx.x == 0) { - tmp_sum = 0; - } - __syncthreads(); - - // Get left count - int count = end - begin; - int left_count = 0; - for (auto idx : dh::BlockStrideRange(int(0), count)) { - left_count += op(begin[idx]); - } - atomicAdd(&tmp_sum, left_count); - __syncthreads(); - left_count = tmp_sum; - - int loffset = 0, part = left_count, roffset = part; - auto tid = threadIdx.x; - while (loffset < part && roffset < count) { - // find the samples in the left that belong to right and vice-versa - auto loff = loffset + tid * kItemsPerThread, roff = roffset + tid * kItemsPerThread; - - PartitionScanPair flag[kItemsPerThread]; - for (int i = 0; i < kItemsPerThread; i++) { - flag[i].left = loff + i < part ? !op(begin[loff + i]) : 0; - flag[i].right = roff + i < count ? op(begin[roff + i]) : 0; - } - // scan to compute the locations for each 'misfit' in the two partitions - PartitionScanPair partial_sum[kItemsPerThread]; - PartitionScanPair sum; - BlockScanT(temp).ExclusiveSum(flag, partial_sum, sum); - int minlen = sum.left < sum.right ? sum.left : sum.right; - // compaction to figure out the right locations to swap - for (int i = 0; i < kItemsPerThread; i++) { - if (flag[i].left) lcomp[partial_sum[i].left] = tid * kItemsPerThread+i; - if (flag[i].right) rcomp[partial_sum[i].right] = tid * kItemsPerThread+i; - } - __syncthreads(); - - // swap the 'misfit's - for (int i = tid; i < minlen; i += kBlockSize) { - auto a = begin[lcomp[i] + loffset]; - auto b = begin[rcomp[i] + roffset]; - begin[lcomp[i] + loffset] = b; - begin[rcomp[i] + roffset] = a; - } - loffset = sum.left == minlen ? loffset + kBlockSize * kItemsPerThread : loffset + lcomp[minlen]; - roffset = sum.right == minlen ? 
roffset + kBlockSize * kItemsPerThread : roffset + rcomp[minlen]; - } - return left_count; - } -}; } // namespace dh From a53ba8726e4cf03044dea995d1254b15b9af5f12 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 20 May 2022 04:03:07 -0700 Subject: [PATCH 33/64] Add tests for AtomicIncrement --- src/tree/gpu_hist/row_partitioner.cuh | 8 +- tests/cpp/common/test_device_helpers.cu | 79 ------------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 22 ++++++ 3 files changed, 26 insertions(+), 83 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 17a24e9600db..dc7605305912 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -29,7 +29,7 @@ struct Segment { template struct KernelBatchArgs { - static const int kMaxBatch = 8; + static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; @@ -81,12 +81,12 @@ struct IndexFlagOp { __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, int batch_idx) { int mask = __activemask(); - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, 0)); + int leader = __ffs(mask) - 1; + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); // If all threads here are working on the same node // we can do a more efficient reduction with warp intrinsics if (group_is_contiguous) { unsigned ballot = __ballot_sync(mask, increment); - int leader = __ffs(mask) - 1; if (threadIdx.x % 32 == leader) { atomicAdd(d_counts + batch_idx, // NOLINT __popc(ballot)); // NOLINT @@ -197,7 +197,7 @@ class RowPartitioner { public: RowPartitioner(int device_idx, size_t num_rows); - ~RowPartitioner(); + ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/tests/cpp/common/test_device_helpers.cu b/tests/cpp/common/test_device_helpers.cu index 18fdb5b7eb34..6e8668bd2581 100644 --- a/tests/cpp/common/test_device_helpers.cu +++ b/tests/cpp/common/test_device_helpers.cu @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include "../../../src/common/device_helpers.cuh" @@ -265,82 +264,4 @@ void TestAtomicAdd() { TEST(AtomicAdd, Int64) { TestAtomicAdd(); } - -template -__global__ void TestBlockPartitionKernel(int* begin, int* end, std::size_t* count_out, OpT op) { - auto count = dh::BlockPartition().Partition(begin, end, op); - if (threadIdx.x == 0) { - *count_out = count; - } -} - -template -void TestBlockPartition(thrust::device_vector& x) { - thrust::device_vector count(1); - - auto op = [] __device__(int y) { return y % 2 == 0; }; - TestBlockPartitionKernel - <<<1, kBlockSize>>>(x.data().get(), x.data().get() + x.size(), count.data().get(), op); - - auto reference = thrust::count_if(x.begin(), x.end(), op); - EXPECT_EQ(count[0], reference); - - auto left_partition_count = thrust::count_if(x.begin(), x.begin() + count[0], op); - EXPECT_EQ(count[0], left_partition_count); - auto right_partition_count = thrust::count_if(x.begin() + count[0], x.end(), op); - EXPECT_EQ(0, right_partition_count); -} - -TEST(BlockPartition, BlockPartitionEmpty) { - thrust::device_vector x; - TestBlockPartition<256>(x); -} - -TEST(BlockPartition, BlockPartitionUniform) { - thrust::device_vector x(100); - TestBlockPartition<256>(x); - thrust::fill(x.begin(),x.end(),1); - TestBlockPartition<256>(x); -} - -void MakeRandom(thrust::device_vector& x, int seed) { - auto 
counting = thrust::make_counting_iterator(0); - thrust::transform(counting, counting + x.size(), x.begin(), [=] __device__(auto idx) { - thrust::default_random_engine gen(seed); - thrust::uniform_int_distribution dist; - gen.discard(idx); - return dist(gen); - }); -} - -TEST(BlockPartition, BlockPartitionBasic) { - thrust::device_vector x = std::vector{0,1,2}; - TestBlockPartition<256>(x); -} - -TEST(BlockPartition, BlockPartition) { - int sizes[] = {1, 37, 1092}; - int seeds[] = {0, 1, 2, 3, 4}; - for (auto seed : seeds) { - for (auto size : sizes) { - thrust::device_vector x(size); - MakeRandom(x, seed); - thrust::device_vector y = x; - TestBlockPartition<1>(y); - y = x; - TestBlockPartition<1024>(y); - y = x; - TestBlockPartition<37>(y); - } - } -} - -TEST(BlockPartition, BlockPartitionBenchmark) { - for (int i = 0; i < 20; i++) { - thrust::device_vector x(10000000); - MakeRandom(x, i); - TestBlockPartition<1024>(x); - } -} - } // namespace xgboost diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 9861e79bba66..a8672dc2ec1b 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -87,5 +87,27 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } +void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { + thrust::device_vector group(group_in); + thrust::device_vector increment(increment_in); + thrust::device_vector reference(group_in.size()); + thrust::device_vector result(group_in.size()); + + auto d_group = group.data().get(); + auto d_increment = increment.data().get(); + auto d_reference = reference.data().get(); + auto d_result = result.data().get(); + dh::LaunchN(group.size(), [=] __device__(std::size_t idx) { + AtomicIncrement(d_result, d_increment[idx], d_group[idx]); + atomicAdd(d_reference + d_group[idx], d_increment[idx]); + }); + + EXPECT_EQ(reference, result); +} + +TEST(GpuHist, AtomicIncrement) { + TestAtomicIncrement({0, 0, 0}, {1, 0, 1}); + TestAtomicIncrement({0, 0, 1}, {1, 0, 1}); +} } // namespace tree } // namespace xgboost From 7450d68bbe5997db50ae8af56f47d267d7e8de8e Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 23 May 2022 04:33:42 -0700 Subject: [PATCH 34/64] Change lambda to kernel --- src/tree/gpu_hist/row_partitioner.cuh | 52 +++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index dc7605305912..95fe3d3454db 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -32,9 +32,8 @@ struct KernelBatchArgs { static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) const { + __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) { std::size_t sum = 0; for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { @@ -96,22 +95,63 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } + +template +__global__ void GetLeftCountsKernel(KernelBatchArgs args, common::Span ridx, + common::Span scan_inputs, + common::Span d_left_counts, OpT op, std::size_t n){ + + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i 
< sizeof(KernelBatchArgs); i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); + // Assign this thread to a row + std::size_t idx = blockIdx.x *blockDim.x + threadIdx.x; + if (idx >= n) return; + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); +} + + template void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op) { // Launch 1 thread for each row - dh::LaunchN<1, 128>(args.TotalRows(), [=] __device__(std::size_t idx) { + constexpr int kBlockSize = 256; + const int grid_size = + static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize)); + +GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); + +/* + dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); + // Assign this thread to a row int16_t batch_idx; std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], args.data[batch_idx]), batch_idx); + AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], s_args.data[batch_idx]), batch_idx); }); + */ } // This is a transformer output iterator From 6df1259f375d7d5ca26eaed005d583c7eb989c0f Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 24 May 2022 04:46:42 -0700 Subject: [PATCH 35/64] Smem + lineinfo --- cmake/Utils.cmake | 1 + src/tree/gpu_hist/row_partitioner.cuh | 33 ++++++++++++++------------- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 963c494ccf26..6c124d625f3a 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -136,6 +136,7 @@ function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> + $<$:-lineinfo> $<$:${GEN_CODE}> $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$:-Xfatbin=-compress-all>) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 95fe3d3454db..a8fb15b97752 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -97,27 +97,27 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo template -__global__ void GetLeftCountsKernel(KernelBatchArgs args, common::Span ridx, +__global__ void GetLeftCountsKernel(const KernelBatchArgs args, common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op, std::size_t n){ __shared__ KernelBatchArgs s_args; - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs)/8; i += kBlockSize) { + 
reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } __syncthreads(); - // Assign this thread to a row - std::size_t idx = blockIdx.x *blockDim.x + threadIdx.x; - if (idx >= n) return; - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + scan_inputs[idx] = + IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), + batch_idx, op_res}; + + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + } } @@ -128,9 +128,10 @@ void GetLeftCounts(const KernelBatchArgs& args, common::Span // Launch 1 thread for each row constexpr int kBlockSize = 256; const int grid_size = - static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize)); + std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); + -GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); + GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); /* dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { From 40109427c1490c9426c9702c93747107b8436499 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 24 May 2022 05:51:57 -0700 Subject: [PATCH 36/64] Use stream --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a8fb15b97752..a0c5be5c6502 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -201,7 +201,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span Date: Wed, 25 May 2022 07:46:20 -0700 Subject: [PATCH 37/64] Fast global stores --- src/tree/gpu_hist/row_partitioner.cuh | 76 ++++++++++++++++++++------- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a0c5be5c6502..d806c8e87ca6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -95,31 +95,67 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } - template -__global__ void GetLeftCountsKernel(const KernelBatchArgs args, common::Span ridx, - common::Span scan_inputs, - common::Span d_left_counts, OpT op, std::size_t n){ +__global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( + const KernelBatchArgs args, common::Span ridx, + common::Span scan_inputs, common::Span d_left_counts, + OpT op, std::size_t n) { + // Load this large struct in shared memory + // if left to its own devices the compiler loads this very slowly + __shared__ KernelBatchArgs s_args; + + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 8; i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + } + __syncthreads(); - __shared__ KernelBatchArgs s_args; + // Global writes of IndexFlagTuple are inefficient due to its 16b size + // we can use cub to optimise this + static_assert(sizeof(IndexFlagTuple) 
== 16, "Expected IndexFlagTuple to be 16 bytes."); + constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); + typedef cub::BlockStore BlockStoreT; + __shared__ typename BlockStoreT::TempStorage temp_storage; - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs)/8; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - __syncthreads(); - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += blockDim.x * gridDim.x) { - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = - IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), - batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + // Use the raw pointer because the performance of global writes matters here + // We don't really need the bounds checking + IndexFlagTuple* out_ptr = scan_inputs.data(); + + auto get_tuple = [=]__device__ (auto idx){ + int16_t batch_idx; + std::size_t item_idx; + s_args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; + }; + + // Process full tiles + std::size_t tile_offset = blockIdx.x * kBlockSize; + while (tile_offset + kBlockSize <= n) { + std::size_t idx = tile_offset + threadIdx.x; + auto tuple = get_tuple(idx); + auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); + BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + tile_offset += kBlockSize * gridDim.x; + } + + // Process partial tile + if (tile_offset < n) { + // Make sure we don't compute a negative number with unsigned integers + int valid_items = int(int64_t(n) - int64_t(tile_offset)); + std::size_t idx = tile_offset + threadIdx.x; + IndexFlagTuple tuple; + if (idx < n) { + tuple = get_tuple(idx); } -} + auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); + BlockStoreT(temp_storage) + .Store(block_write_ptr, *static_cast(static_cast(&tuple)), + valid_items * kTupleWords); + } +} template void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, From 24fb339c46bd7d4f9070c16d4951f03f74e96b71 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 27 May 2022 06:06:25 -0700 Subject: [PATCH 38/64] Fast load without shmem --- src/tree/gpu_hist/row_partitioner.cuh | 39 ++++++++++++++----- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 2 +- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d806c8e87ca6..a16989a165bd 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,14 +26,15 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; - template struct KernelBatchArgs { static const int kMaxBatch = 32; Segment segments[kMaxBatch]; OpDataT data[kMaxBatch]; + + KernelBatchArgs() = default; // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t &batch_idx, std::size_t &item_idx) { + __device__ void AssignBatch(std::size_t idx, int16_t& batch_idx, std::size_t& item_idx) const { std::size_t sum = 0; for (int16_t i = 0; i < kMaxBatch; i++) { if (sum + segments[i].Size() > idx) { @@ 
-95,6 +96,17 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } +template +__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, OpT op){ + int16_t batch_idx; + std::size_t item_idx; + args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], args.data[batch_idx]); + AtomicIncrement(d_left_counts.data(), op_res, batch_idx); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; +} + template __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( const KernelBatchArgs args, common::Span ridx, @@ -102,12 +114,16 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( OpT op, std::size_t n) { // Load this large struct in shared memory // if left to its own devices the compiler loads this very slowly - __shared__ KernelBatchArgs s_args; - - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 8; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; + //__shared__ KernelBatchArgs s_args; + /* + __shared__ cub::Uninitialized> s_temp; + KernelBatchArgs& s_args = s_temp.Alias(); + for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { + reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } + __syncthreads(); + */ // Global writes of IndexFlagTuple are inefficient due to its 16b size // we can use cub to optimise this @@ -120,7 +136,8 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( // We don't really need the bounds checking IndexFlagTuple* out_ptr = scan_inputs.data(); - auto get_tuple = [=]__device__ (auto idx){ + /* + auto get_tuple = [&]__device__ (auto idx){ int16_t batch_idx; std::size_t item_idx; s_args.AssignBatch(idx, batch_idx, item_idx); @@ -129,12 +146,14 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( return IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; }; + */ // Process full tiles std::size_t tile_offset = blockIdx.x * kBlockSize; while (tile_offset + kBlockSize <= n) { std::size_t idx = tile_offset + threadIdx.x; - auto tuple = get_tuple(idx); + //auto tuple = get_tuple(idx); + auto tuple = GetTuple(idx,ridx,d_left_counts,args,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; @@ -147,7 +166,8 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( std::size_t idx = tile_offset + threadIdx.x; IndexFlagTuple tuple; if (idx < n) { - tuple = get_tuple(idx); + tuple = GetTuple(idx,ridx,d_left_counts,args,op); + //tuple = get_tuple(idx); } auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); @@ -225,6 +245,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span(), write_results); auto counting = thrust::make_counting_iterator(0llu); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index a8672dc2ec1b..0fded6828236 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -64,7 +64,7 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector Date: Mon, 30 May 2022 04:59:35 -0700 Subject: [PATCH 39/64] Memcpy version --- src/tree/gpu_hist/row_partitioner.cuh | 121 ++++++++++++------ 
.../cpp/tree/gpu_hist/test_row_partitioner.cu | 18 ++- 2 files changed, 94 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a16989a165bd..3d47b986f240 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -54,6 +54,29 @@ struct KernelBatchArgs { } }; +template +struct KernelMemcpyArgs { + Segment segment; + OpDataT data; +}; + +template +__device__ void AssignBatch(const common::Span> batch_info, + std::size_t idx, int16_t& batch_idx, std::size_t& item_idx, OpDataT&data) { + const auto ptr = batch_info.data(); + std::size_t sum = 0; + + for (int16_t i = 0; i < batch_info.size(); i++) { + if (sum + ptr[i].segment.Size() > idx) { + batch_idx = i; + item_idx = (idx - sum) + ptr[i].segment.begin; + data = ptr[i].data; + break; + } + sum += ptr[i].segment.Size(); + } +} + // We can scan over this tuple, where the scan gives us information on how to partition inputs // according to the flag struct IndexFlagTuple { @@ -97,11 +120,13 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } template -__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, OpT op){ +__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, const common::Span> batch_info, OpT op){ int16_t batch_idx; std::size_t item_idx; - args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], args.data[batch_idx]); + OpDataT data; + AssignBatch(batch_info,idx, batch_idx, item_idx, data); + //args.AssignBatch(idx, batch_idx, item_idx); + auto op_res = op(ridx[item_idx], data); AtomicIncrement(d_left_counts.data(), op_res, batch_idx); return IndexFlagTuple{bst_uint(item_idx), op_res, bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; @@ -109,7 +134,8 @@ __device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span template __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( - const KernelBatchArgs args, common::Span ridx, + const KernelBatchArgs args, const common::Span> batch_info, + common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op, std::size_t n) { // Load this large struct in shared memory @@ -121,7 +147,7 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; } - + __syncthreads(); */ @@ -153,9 +179,9 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( while (tile_offset + kBlockSize <= n) { std::size_t idx = tile_offset + threadIdx.x; //auto tuple = get_tuple(idx); - auto tuple = GetTuple(idx,ridx,d_left_counts,args,op); + auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + //BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; } @@ -166,20 +192,23 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( std::size_t idx = tile_offset + threadIdx.x; IndexFlagTuple tuple; if (idx < n) { - tuple = GetTuple(idx,ridx,d_left_counts,args,op); + tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); 
//tuple = get_tuple(idx); } + /* auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage) .Store(block_write_ptr, *static_cast(static_cast(&tuple)), valid_items * kTupleWords); + */ } } template -void GetLeftCounts(const KernelBatchArgs& args, common::Span ridx, - common::Span scan_inputs, +void GetLeftCounts(const KernelBatchArgs& args, + const common::Span> batch_info, + common::Span ridx, common::Span scan_inputs, common::Span d_left_counts, OpT op) { // Launch 1 thread for each row constexpr int kBlockSize = 256; @@ -187,28 +216,7 @@ void GetLeftCounts(const KernelBatchArgs& args, common::Span std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); - GetLeftCountsKernel<<>>(args, ridx, scan_inputs,d_left_counts,op, args.TotalRows()); - -/* - dh::LaunchN<1, kBlockSize>(args.TotalRows(), [=] __device__(std::size_t idx) { - __shared__ KernelBatchArgs s_args; - - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs); i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - __syncthreads(); - - // Assign this thread to a row - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - scan_inputs[idx] = IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; - - AtomicIncrement(d_left_counts.data(), op(ridx[item_idx], s_args.data[batch_idx]), batch_idx); - }); - */ + GetLeftCountsKernel<<>>(args, batch_info,ridx, scan_inputs,d_left_counts,op, args.TotalRows()); } // This is a transformer output iterator @@ -238,28 +246,43 @@ struct WriteResultsFunctor { } }; -template -void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, +template +void SortPositionBatch(const KernelBatchArgs& args, + const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, common::Span scan_inputs, - common::Span left_counts, - cudaStream_t stream) { + common::Span left_counts, OpT op,cudaStream_t stream) { static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + auto op_res = op(ridx[item_idx], data); + return IndexFlagTuple{bst_uint(item_idx), op_res, + bst_uint(batch_info.data()[batch_idx].segment.begin), batch_idx, + op_res}; + }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, scan_inputs.data(), + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, scan_inputs.data(), + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); // copy active segments back to original buffer dh::LaunchN(args.TotalRows(), stream, [=] __device__(std::size_t idx) { - auto item_idx = scan_inputs[idx].idx; + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, 
batch_idx, item_idx, data); ridx[item_idx] = ridx_tmp[item_idx]; }); } @@ -291,6 +314,7 @@ class RowPartitioner { dh::TemporaryArray ridx_tmp_; dh::TemporaryArray scan_inputs_; dh::PinnedMemory pinned_; + dh::PinnedMemory pinned2_; std::vector streams_; public: @@ -324,6 +348,18 @@ class RowPartitioner { CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); + auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); + + std::size_t total_rows = 0; + for (int i = 0; i < nidx.size(); i++) { + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)), op_data.at(i)}; + total_rows += ridx_segments_.at(nidx.at(i)).Size(); + } + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(KernelMemcpyArgs), + cudaMemcpyDefault, streams_[0])); + // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying // memory from device to host for (std::size_t batch_start = 0; batch_start < nidx.size(); @@ -343,7 +379,8 @@ class RowPartitioner { // Evaluate the operator for each row, where true means 'go left' // Store the result of the operator for the next step // Count the number of rows going left, store in d_left_counts - GetLeftCounts(args, dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op); + GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), + dh::ToSpan(d_left_counts), op); // Start copying the counts to the host // We overlap this transfer with the sort step using streams @@ -354,8 +391,8 @@ class RowPartitioner { cudaMemcpyDefault, streams_[0])); // Partition the rows according to the operator - SortPositionBatch(args, dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), - dh::ToSpan(d_left_counts), streams_[1]); + SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), + dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); dh::safe_cuda(cudaStreamSynchronize(streams_[0])); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 0fded6828236..c29032c5790d 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -60,12 +60,24 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); + std::vector> h_batch_info(segments.size()); + dh::TemporaryArray> d_batch_info(segments.size()); + + std::size_t total_rows = 0; + for (int i = 0; i < segments.size(); i++) { + h_batch_info[i] = {segments.at(i), 0}; + total_rows += segments.at(i).Size(); + } + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(KernelMemcpyArgs), + cudaMemcpyDefault, nullptr)); KernelBatchArgs args; std::copy(segments.begin(), segments.end(), args.segments); std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(ridx), dh::ToSpan(scan_tmp),dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(scan_tmp), dh::ToSpan(left_counts), - nullptr); + GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(scan_tmp), + dh::ToSpan(left_counts), op); + SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(scan_tmp), dh::ToSpan(left_counts),op, nullptr); auto op_without_data = [=] __device__(auto ridx) 
{ return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 7d5d7e71e93d765a14d9c67df21d2e3c9c045758 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 30 May 2022 08:32:42 -0700 Subject: [PATCH 40/64] Remove left counts kernel --- src/tree/gpu_hist/row_partitioner.cuh | 47 +++++++++++++++------------ 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 3d47b986f240..05debc22e603 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -83,6 +83,7 @@ struct IndexFlagTuple { bst_uint idx; // The location of the item we are working on in ridx_ bst_uint flag_scan; // This gets populated after scanning bst_uint segment_start; // Start offset of this node segment + bst_uint segment_end; // End offset of this node segment int16_t batch_idx; // Which node in the batch does this item belong to bool flag; // Result of op (is this item going left?) }; @@ -92,7 +93,7 @@ struct IndexFlagOp { // Segmented scan - resets if we cross batch boundaries if (a.batch_idx == b.batch_idx) { // Accumulate the flags, everything else stays the same - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.batch_idx, b.flag}; + return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.segment_end,b.batch_idx, b.flag}; } else { return b; } @@ -129,7 +130,7 @@ __device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span auto op_res = op(ridx[item_idx], data); AtomicIncrement(d_left_counts.data(), op_res, batch_idx); return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin), batch_idx, op_res}; + bst_uint(args.segments[batch_idx].begin),bst_uint(args.segments[batch_idx].end), batch_idx, op_res}; } template @@ -153,7 +154,6 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( // Global writes of IndexFlagTuple are inefficient due to its 16b size // we can use cub to optimise this - static_assert(sizeof(IndexFlagTuple) == 16, "Expected IndexFlagTuple to be 16 bytes."); constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); typedef cub::BlockStore BlockStoreT; __shared__ typename BlockStoreT::TempStorage temp_storage; @@ -181,7 +181,7 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( //auto tuple = get_tuple(idx); auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - //BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); + BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); tile_offset += kBlockSize * gridDim.x; } @@ -196,12 +196,10 @@ __global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( //tuple = get_tuple(idx); } - /* auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); BlockStoreT(temp_storage) .Store(block_write_ptr, *static_cast(static_cast(&tuple)), valid_items * kTupleWords); - */ } } @@ -234,13 +232,23 @@ struct WriteResultsFunctor { // node so far during scan. 
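// A minimal host-side sketch (not part of the patch; names are illustrative) of the
// scatter-address arithmetic in WriteResultsFunctor: an inclusive scan of the go-left
// flags gives each flagged row its slot at the front of the segment, while unflagged
// rows are packed backwards from the end of the segment.
#include <cstddef>
#include <vector>

std::vector<unsigned> ScanPartitionSketch(const std::vector<unsigned>& rows,
                                          const std::vector<char>& go_left) {
  const std::size_t segment_begin = 0, segment_end = rows.size();
  std::vector<unsigned> out(rows.size());
  std::size_t flag_scan = 0;  // inclusive scan of go_left, computed sequentially here
  for (std::size_t idx = segment_begin; idx < segment_end; ++idx) {
    flag_scan += go_left[idx] ? 1 : 0;
    std::size_t scatter_address;
    if (go_left[idx]) {
      const std::size_t num_previous_flagged = flag_scan - 1;  // -1 because the scan is inclusive
      scatter_address = segment_begin + num_previous_flagged;
    } else {
      const std::size_t num_previous_unflagged = (idx - segment_begin) - flag_scan;
      // Right-hand rows fill the segment from the back, in reverse encounter order
      scatter_address = segment_end - num_previous_unflagged - 1;
    }
    out[scatter_address] = rows[idx];
  }
  return out;
}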
std::size_t scatter_address; if (x.flag) { - scatter_address = x.segment_start + x.flag_scan - 1; // -1 because inclusive scan + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + scatter_address = x.segment_start + num_previous_flagged; } else { + + bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; // current number of rows belong to right node + total number of rows // belong to left node - scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + // scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; + scatter_address = x.segment_end - num_previous_unflagged - 1; } ridx_out[scatter_address] = ridx_in[x.idx]; + + if (x.idx == (x.segment_end - 1)) { + // Write out counts + left_counts[x.batch_idx] = x.flag_scan; + } + // Discard return {}; } @@ -252,7 +260,7 @@ void SortPositionBatch(const KernelBatchArgs& args, common::Span ridx, common::Span ridx_tmp, common::Span scan_inputs, common::Span left_counts, OpT op,cudaStream_t stream) { - static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); + //static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; auto discard_write_iterator = @@ -266,7 +274,7 @@ void SortPositionBatch(const KernelBatchArgs& args, AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto op_res = op(ridx[item_idx], data); return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin), batch_idx, + bst_uint(batch_info.data()[batch_idx].segment.begin),bst_uint(batch_info.data()[batch_idx].segment.end), batch_idx, op_res}; }); size_t temp_bytes = 0; @@ -358,7 +366,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(KernelMemcpyArgs), - cudaMemcpyDefault, streams_[0])); + cudaMemcpyDefault, streams_[1])); // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying // memory from device to host @@ -379,22 +387,19 @@ class RowPartitioner { // Evaluate the operator for each row, where true means 'go left' // Store the result of the operator for the next step // Count the number of rows going left, store in d_left_counts - GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), - dh::ToSpan(d_left_counts), op); + //GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), + //dh::ToSpan(d_left_counts), op); - // Start copying the counts to the host - // We overlap this transfer with the sort step using streams - // We only need the result after sorting to update the segment boundaries - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[0])); // Partition the rows according to the operator SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); + dh::safe_cuda( + cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), + sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), + cudaMemcpyDefault, streams_[1])); - dh::safe_cuda(cudaStreamSynchronize(streams_[0])); + dh::safe_cuda(cudaStreamSynchronize(streams_[1])); // Update segments for (int i = 0; i < (batch_end - 
batch_start); i++) { From 77f85504d583c04f1328158125d65b2c5fffd3bd Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 31 May 2022 03:02:37 -0700 Subject: [PATCH 41/64] Unstable partition --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 280 ++++++------------ .../cpp/tree/gpu_hist/test_row_partitioner.cu | 19 +- 3 files changed, 95 insertions(+), 206 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index b079189e6b7d..7676c8e67495 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -12,7 +12,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), scan_inputs_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 05debc22e603..f062e2a4ed48 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -26,33 +26,7 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -template -struct KernelBatchArgs { - static const int kMaxBatch = 32; - Segment segments[kMaxBatch]; - OpDataT data[kMaxBatch]; - - KernelBatchArgs() = default; - // Given a global thread idx, assign it to an item from one of the segments - __device__ void AssignBatch(std::size_t idx, int16_t& batch_idx, std::size_t& item_idx) const { - std::size_t sum = 0; - for (int16_t i = 0; i < kMaxBatch; i++) { - if (sum + segments[i].Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + segments[i].begin; - break; - } - sum += segments[i].Size(); - } - } - std::size_t TotalRows() const { - std::size_t total_rows = 0; - for (auto segment : segments) { - total_rows += segment.Size(); - } - return total_rows; - } -}; +using PartitionCountsT = thrust::pair; template struct KernelMemcpyArgs { @@ -120,103 +94,6 @@ __forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bo } } -template -__device__ __forceinline__ IndexFlagTuple GetTuple(std::size_t idx, common::Span ridx, common::Span d_left_counts,const KernelBatchArgs &args, const common::Span> batch_info, OpT op){ - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info,idx, batch_idx, item_idx, data); - //args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], data); - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(args.segments[batch_idx].begin),bst_uint(args.segments[batch_idx].end), batch_idx, op_res}; -} - -template -__global__ __launch_bounds__(kBlockSize) void GetLeftCountsKernel( - const KernelBatchArgs args, const common::Span> batch_info, - common::Span ridx, - common::Span scan_inputs, common::Span d_left_counts, - OpT op, std::size_t n) { - // Load this large struct in shared memory - // if left to its own devices the compiler loads this very slowly - //__shared__ KernelBatchArgs s_args; - /* - __shared__ cub::Uninitialized> s_temp; - KernelBatchArgs& s_args = s_temp.Alias(); - for (int i = threadIdx.x; i < sizeof(KernelBatchArgs) / 4; i += kBlockSize) { - reinterpret_cast(&s_args)[i] = reinterpret_cast(&args)[i]; - } - 
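// Host-side sketch (illustrative names, not the patch code) of the segment walk that the
// AssignBatch helper performs for every thread: accumulate segment sizes over the small
// per-node {segment, op data} array until the flat index falls inside one of them, then
// recover which node the thread serves and which slot of ridx it touches.
#include <cstddef>
#include <vector>

struct SegmentSketch {
  std::size_t begin{0}, end{0};
  std::size_t Size() const { return end - begin; }
};

template <typename OpDataT>
struct NodeEntrySketch {
  SegmentSketch segment;
  OpDataT data;
};

template <typename OpDataT>
bool AssignBatchSketch(const std::vector<NodeEntrySketch<OpDataT>>& batch_info,
                       std::size_t flat_idx, int* node_in_batch, std::size_t* item_idx) {
  std::size_t sum = 0;
  for (int i = 0; i < static_cast<int>(batch_info.size()); ++i) {
    if (sum + batch_info[i].segment.Size() > flat_idx) {
      *node_in_batch = i;                                           // node this index serves
      *item_idx = (flat_idx - sum) + batch_info[i].segment.begin;   // position inside ridx
      return true;
    }
    sum += batch_info[i].segment.Size();
  }
  return false;  // flat_idx is past the total number of rows in the batch
}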
- __syncthreads(); - */ - - // Global writes of IndexFlagTuple are inefficient due to its 16b size - // we can use cub to optimise this - constexpr int kTupleWords = sizeof(IndexFlagTuple)/sizeof(int); - typedef cub::BlockStore BlockStoreT; - __shared__ typename BlockStoreT::TempStorage temp_storage; - - // Use the raw pointer because the performance of global writes matters here - // We don't really need the bounds checking - IndexFlagTuple* out_ptr = scan_inputs.data(); - - /* - auto get_tuple = [&]__device__ (auto idx){ - int16_t batch_idx; - std::size_t item_idx; - s_args.AssignBatch(idx, batch_idx, item_idx); - auto op_res = op(ridx[item_idx], s_args.data[batch_idx]); - AtomicIncrement(d_left_counts.data(), op_res, batch_idx); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(s_args.segments[batch_idx].begin), batch_idx, op_res}; - }; - */ - - // Process full tiles - std::size_t tile_offset = blockIdx.x * kBlockSize; - while (tile_offset + kBlockSize <= n) { - std::size_t idx = tile_offset + threadIdx.x; - //auto tuple = get_tuple(idx); - auto tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); - auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage).Store(block_write_ptr, *static_cast(static_cast(&tuple))); - tile_offset += kBlockSize * gridDim.x; - } - - // Process partial tile - if (tile_offset < n) { - // Make sure we don't compute a negative number with unsigned integers - int valid_items = int(int64_t(n) - int64_t(tile_offset)); - std::size_t idx = tile_offset + threadIdx.x; - IndexFlagTuple tuple; - if (idx < n) { - tuple = GetTuple(idx,ridx,d_left_counts,args,batch_info,op); - //tuple = get_tuple(idx); - } - - auto block_write_ptr = reinterpret_cast(out_ptr + tile_offset); - BlockStoreT(temp_storage) - .Store(block_write_ptr, *static_cast(static_cast(&tuple)), - valid_items * kTupleWords); - } -} - -template -void GetLeftCounts(const KernelBatchArgs& args, - const common::Span> batch_info, - common::Span ridx, common::Span scan_inputs, - common::Span d_left_counts, OpT op) { - // Launch 1 thread for each row - constexpr int kBlockSize = 256; - const int grid_size = - std::max(256,static_cast(xgboost::common::DivRoundUp(args.TotalRows(), kBlockSize))); - - - GetLeftCountsKernel<<>>(args, batch_info,ridx, scan_inputs,d_left_counts,op, args.TotalRows()); -} - // This is a transformer output iterator // It takes the result of the scan and performs the partition // To understand how a scan is used to partition elements see: @@ -225,11 +102,9 @@ void GetLeftCounts(const KernelBatchArgs& args, struct WriteResultsFunctor { bst_uint* ridx_in; bst_uint* ridx_out; - unsigned long long int* left_counts; + PartitionCountsT *counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - // the ex_scan_result represents how many rows have been assigned to left - // node so far during scan. 
std::size_t scatter_address; if (x.flag) { bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan @@ -237,16 +112,13 @@ struct WriteResultsFunctor { } else { bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; - // current number of rows belong to right node + total number of rows - // belong to left node - // scatter_address = (x.idx - x.flag_scan) + left_counts[x.batch_idx]; scatter_address = x.segment_end - num_previous_unflagged - 1; } ridx_out[scatter_address] = ridx_in[x.idx]; if (x.idx == (x.segment_end - 1)) { // Write out counts - left_counts[x.batch_idx] = x.flag_scan; + counts[x.batch_idx] = {x.flag_scan,0}; } // Discard @@ -255,13 +127,11 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch(const KernelBatchArgs& args, - const common::Span> batch_info, - common::Span ridx, - common::Span ridx_tmp, common::Span scan_inputs, - common::Span left_counts, OpT op,cudaStream_t stream) { - //static_assert(sizeof(IndexFlagTuple) == 16, "Struct should be 16 bytes aligned."); - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), left_counts.data()}; +void SortPositionBatch(const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); @@ -273,20 +143,67 @@ void SortPositionBatch(const KernelBatchArgs& args, OpDataT data; AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto op_res = op(ridx[item_idx], data); - return IndexFlagTuple{bst_uint(item_idx), op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin),bst_uint(batch_info.data()[batch_idx].segment.end), batch_idx, + return IndexFlagTuple{bst_uint(item_idx), + op_res, + bst_uint(batch_info.data()[batch_idx].segment.begin), + bst_uint(batch_info.data()[batch_idx].segment.end), + batch_idx, op_res}; }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), - args.TotalRows(), stream); + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); dh::TemporaryArray temp(temp_bytes); cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), args.TotalRows(), stream); + discard_write_iterator, IndexFlagOp(), total_rows, stream); // copy active segments back to original buffer - dh::LaunchN(args.TotalRows(), stream, [=] __device__(std::size_t idx) { + dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + ridx[item_idx] = ridx_tmp[item_idx]; + }); +} + +template +__global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( + const common::Span> batch_info, common::Span ridx, + common::Span ridx_tmp, common::Span counts, OpT op, + std::size_t total_rows) { + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { + int16_t batch_idx; + std::size_t item_idx; + OpDataT data; + AssignBatch(batch_info, idx, batch_idx, item_idx, data); + auto segment = batch_info[batch_idx].segment; + auto op_res = op(ridx[item_idx], data); + if (op_res) { + auto num_left_items = 
atomicAdd(&counts.data()[batch_idx].first, 1); + ridx_tmp[segment.begin + num_left_items] = ridx[item_idx]; + } else { + auto num_right_items = atomicAdd(&counts.data()[batch_idx].second, 1); + ridx_tmp[segment.end - num_right_items - 1] = ridx[item_idx]; + } + } +} + +template +void SortPositionBatchUnstable(const common::Span> batch_info, + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + + constexpr int kBlockSize = 256; + const int grid_size = + std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + + SortPositionBatchUnstableKernel + <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); + + // copy active segments back to original buffer + dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { int16_t batch_idx; std::size_t item_idx; OpDataT data; @@ -320,7 +237,6 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray scan_inputs_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; std::vector streams_; @@ -368,53 +284,31 @@ class RowPartitioner { h_batch_info.size() * sizeof(KernelMemcpyArgs), cudaMemcpyDefault, streams_[1])); - // Process nodes in batches to amortise the fixed latency costs of launching kernels and copying - // memory from device to host - for (std::size_t batch_start = 0; batch_start < nidx.size(); - batch_start += KernelBatchArgs::kMaxBatch) { - // Temporary arrays - auto h_left_counts = pinned_.GetSpan(KernelBatchArgs::kMaxBatch, 0); - dh::TemporaryArray d_left_counts(KernelBatchArgs::kMaxBatch, 0); - - std::size_t batch_end = std::min(batch_start + KernelBatchArgs::kMaxBatch, nidx.size()); - // Prepare kernel arguments - KernelBatchArgs args; - std::copy(op_data.begin() + batch_start, op_data.begin() + batch_end, args.data); - for (int i = 0; i < (batch_end - batch_start); i++) { - args.segments[i] = ridx_segments_.at(nidx[batch_start + i]); - } - - // Evaluate the operator for each row, where true means 'go left' - // Store the result of the operator for the next step - // Count the number of rows going left, store in d_left_counts - //GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(scan_inputs_), - //dh::ToSpan(d_left_counts), op); - - - // Partition the rows according to the operator - SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), - dh::ToSpan(scan_inputs_), dh::ToSpan(d_left_counts), op,streams_[1]); - dh::safe_cuda( - cudaMemcpyAsync(h_left_counts.data(), d_left_counts.data().get(), - sizeof(decltype(d_left_counts)::value_type) * d_left_counts.size(), - cudaMemcpyDefault, streams_[1])); - - dh::safe_cuda(cudaStreamSynchronize(streams_[1])); - - // Update segments - for (int i = 0; i < (batch_end - batch_start); i++) { - auto segment = ridx_segments_.at(nidx[batch_start + i]); - auto left_count = h_left_counts[i]; - CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); - ridx_segments_.resize( - std::max(static_cast(ridx_segments_.size()), - std::max(left_nidx[batch_start + i], right_nidx[batch_start + i]) + 1)); - ridx_segments_[left_nidx[batch_start + i]] = - Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[batch_start + i]] = - Segment(segment.begin + left_count, segment.end); - } + // Temporary arrays + auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); + dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); + + // 
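// Sketch (assumed names, error handling elided) of the readback pattern used around this
// call site: the kernel writes per-node counts on a stream, the counts are copied
// asynchronously into pinned host memory on the same stream, and the host only reads
// them after synchronising that stream. The patch reuses a pre-allocated pinned pool
// (pinned_) instead of allocating on every call as this sketch does.
#include <cuda_runtime.h>
#include <cstddef>
#include <utility>
#include <vector>

using CountPairSketch = std::pair<unsigned, unsigned>;  // (rows left, rows right) per node

void ReadBackCountsSketch(const CountPairSketch* d_counts, std::size_t n_nodes,
                          cudaStream_t stream, std::vector<CountPairSketch>* out) {
  CountPairSketch* h_counts = nullptr;
  // A pinned allocation keeps the copy below truly asynchronous with respect to the host
  cudaMallocHost(reinterpret_cast<void**>(&h_counts), sizeof(CountPairSketch) * n_nodes);
  cudaMemcpyAsync(h_counts, d_counts, sizeof(CountPairSketch) * n_nodes, cudaMemcpyDefault,
                  stream);
  cudaStreamSynchronize(stream);  // counts are only valid on the host after this point
  out->assign(h_counts, h_counts + n_nodes);
  cudaFreeHost(h_counts);
}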
Partition the rows according to the operator + SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), + dh::ToSpan(d_counts), total_rows,op, + streams_[1]); + dh::safe_cuda( + cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), + sizeof(decltype(d_counts)::value_type) * d_counts.size(), + cudaMemcpyDefault, streams_[1])); + + dh::safe_cuda(cudaStreamSynchronize(streams_[1])); + + // Update segments + for (int i = 0; i < nidx.size(); i++) { + auto segment = ridx_segments_.at(nidx[i]); + auto left_count = h_counts[i].first; + CHECK_LE(left_count, segment.Size()); + CHECK_GE(left_count, 0); + ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), + std::max(left_nidx[i], right_nidx[i]) + 1)); + ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); + ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); } } }; diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index c29032c5790d..21628aace40b 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -55,8 +55,7 @@ TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); - thrust::device_vector left_counts(segments.size()); - thrust::device_vector scan_tmp(ridx_in.size()); + thrust::device_vector counts(segments.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); @@ -71,23 +70,19 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), cudaMemcpyDefault, nullptr)); - KernelBatchArgs args; - std::copy(segments.begin(), segments.end(), args.segments); - std::copy(op_data.begin(), op_data.end(), args.data); - GetLeftCounts(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(scan_tmp), - dh::ToSpan(left_counts), op); - SortPositionBatch(args, dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(scan_tmp), dh::ToSpan(left_counts),op, nullptr); + SortPositionBatchUnstable(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { auto begin = ridx.begin() + segments[i].begin; auto end = ridx.begin() + segments[i].end; + PartitionCountsT count = counts[i]; auto left_partition_count = - thrust::count_if(thrust::device, begin, begin + left_counts[i], op_without_data); - EXPECT_EQ(left_partition_count, left_counts[i]); + thrust::count_if(thrust::device, begin, begin + count.first, op_without_data); + EXPECT_EQ(left_partition_count, count.first); auto right_partition_count = - thrust::count_if(thrust::device, begin + left_counts[i], end, op_without_data); + thrust::count_if(thrust::device, begin + count.first, end, op_without_data); EXPECT_EQ(right_partition_count, 0); } } From 14d866306832a677c821f247ff4dcada95338c6a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 31 May 2022 04:25:44 -0700 Subject: [PATCH 42/64] Warp aggregates --- src/tree/gpu_hist/row_partitioner.cuh | 87 ++++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 +- 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh 
b/src/tree/gpu_hist/row_partitioner.cuh index f062e2a4ed48..238a88a430e1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -75,25 +75,6 @@ struct IndexFlagOp { }; -/*! \brief Count how many rows are assigned to left node. */ -__forceinline__ __device__ void AtomicIncrement(unsigned long long* d_counts, bool increment, - int batch_idx) { - int mask = __activemask(); - int leader = __ffs(mask) - 1; - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); - // If all threads here are working on the same node - // we can do a more efficient reduction with warp intrinsics - if (group_is_contiguous) { - unsigned ballot = __ballot_sync(mask, increment); - if (threadIdx.x % 32 == leader) { - atomicAdd(d_counts + batch_idx, // NOLINT - __popc(ballot)); // NOLINT - } - } else { - atomicAdd(d_counts + batch_idx, increment); - } -} - // This is a transformer output iterator // It takes the result of the scan and performs the partition // To understand how a scan is used to partition elements see: @@ -167,34 +148,76 @@ void SortPositionBatch(const common::Span> batch_info, }); } + +__forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } + +/*! \brief Count how many rows are assigned to left node. */ +__forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, + int16_t batch_idx) { + int mask = __activemask(); + int leader = __ffs(mask) - 1; + unsigned int prefix = __popc(mask & __lanemask_lt()); + bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); + // If all threads here are working on the same node + // we can do a more efficient reduction with warp intrinsics + if (group_is_contiguous) { + unsigned ballot = __ballot_sync(mask, go_left); + uint32_t global_left_count = 0; + uint32_t global_right_count = 0; + if (prefix == 0) { + global_left_count = atomicAdd(&d_counts->first, __popc(ballot)); + global_right_count = atomicAdd(&d_counts->second, __popc(mask) - __popc(ballot)); + } + global_left_count = __shfl_sync(mask, global_left_count, leader); + global_right_count = __shfl_sync(mask, global_right_count, leader); + uint32_t local_left_count = __popc(ballot & __lanemask_lt()); + uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; + + if (go_left) { + return global_left_count + local_left_count; + } else { + return global_right_count + local_right_count; + } + + } else { + auto address = go_left ? 
&d_counts->first : &d_counts->second; + return atomicAdd(address, 1); + } +} + template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> batch_info, common::Span ridx, + const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, std::size_t total_rows) { + __shared__ KernelMemcpyArgs s_batch_info[32]; + for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { + s_batch_info[i] = d_batch_info.data()[i]; + } + const common::Span> batch_info(s_batch_info, d_batch_info.size()); + __syncthreads(); + for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { int16_t batch_idx; std::size_t item_idx; OpDataT data; AssignBatch(batch_info, idx, batch_idx, item_idx, data); auto segment = batch_info[batch_idx].segment; - auto op_res = op(ridx[item_idx], data); - if (op_res) { - auto num_left_items = atomicAdd(&counts.data()[batch_idx].first, 1); - ridx_tmp[segment.begin + num_left_items] = ridx[item_idx]; - } else { - auto num_right_items = atomicAdd(&counts.data()[batch_idx].second, 1); - ridx_tmp[segment.end - num_right_items - 1] = ridx[item_idx]; - } + auto ridx = d_ridx[item_idx]; + auto op_res = op(ridx, data); + auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); + auto destination_address = + op_res ? segment.begin + current_num_items : segment.end - current_num_items - 1; + ridx_tmp[destination_address] = ridx; } } template void SortPositionBatchUnstable(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + CHECK_LE(batch_info.size(), 32); constexpr int kBlockSize = 256; const int grid_size = std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 21628aace40b..3ff18a016f33 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -94,6 +94,7 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } +/* void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { thrust::device_vector group(group_in); thrust::device_vector increment(increment_in); @@ -115,6 +116,6 @@ void TestAtomicIncrement(const std::vector& group_in, const std::vector Date: Wed, 1 Jun 2022 02:54:05 -0700 Subject: [PATCH 43/64] Cleanup --- src/tree/gpu_hist/row_partitioner.cu | 9 +- src/tree/gpu_hist/row_partitioner.cuh | 124 ++---------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 23 ---- 3 files changed, 13 insertions(+), 143 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 7676c8e67495..cc117ae743e0 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -16,17 +16,12 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(Segment(0, num_rows)); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); - streams_.resize(2); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamCreate(&stream)); - } + 
dh::safe_cuda(cudaStreamCreate(&stream_)); } RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_)); - for (auto& stream : streams_) { - dh::safe_cuda(cudaStreamDestroy(stream)); - } + dh::safe_cuda(cudaStreamDestroy(stream_)); } common::Span RowPartitioner::GetRows( diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 238a88a430e1..ca59fff337ab 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -51,117 +51,18 @@ __device__ void AssignBatch(const common::Span> batch_ } } -// We can scan over this tuple, where the scan gives us information on how to partition inputs -// according to the flag -struct IndexFlagTuple { - bst_uint idx; // The location of the item we are working on in ridx_ - bst_uint flag_scan; // This gets populated after scanning - bst_uint segment_start; // Start offset of this node segment - bst_uint segment_end; // End offset of this node segment - int16_t batch_idx; // Which node in the batch does this item belong to - bool flag; // Result of op (is this item going left?) -}; - -struct IndexFlagOp { - __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { - // Segmented scan - resets if we cross batch boundaries - if (a.batch_idx == b.batch_idx) { - // Accumulate the flags, everything else stays the same - return {b.idx, a.flag_scan + b.flag_scan, b.segment_start, b.segment_end,b.batch_idx, b.flag}; - } else { - return b; - } - } -}; - - -// This is a transformer output iterator -// It takes the result of the scan and performs the partition -// To understand how a scan is used to partition elements see: -// Harris, Mark, Shubhabrata Sengupta, and John D. Owens. "Parallel prefix sum (scan) with CUDA." -// GPU gems 3.39 (2007): 851-876. 
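// Device-side sketch (assumed names, 1-D blocks assumed) of the warp-aggregated counter
// used by AtomicIncrement: active lanes vote with a ballot, one leader lane issues a
// single atomicAdd for the whole warp, and each flagged lane derives its own slot from
// the ballot bits below it. The patch additionally falls back to a plain atomicAdd when
// the warp spans more than one node.
#include <cstddef>

__device__ unsigned WarpAggregatedAddSketch(unsigned* d_count, bool flag) {
  const unsigned mask = __activemask();        // lanes currently executing this call
  const int lane = threadIdx.x % 32;
  const int leader = __ffs(mask) - 1;          // lowest active lane performs the atomic
  const unsigned ballot = __ballot_sync(mask, flag);
  unsigned warp_base = 0;
  if (lane == leader) {
    warp_base = atomicAdd(d_count, static_cast<unsigned>(__popc(ballot)));
  }
  warp_base = __shfl_sync(mask, warp_base, leader);
  const unsigned lanemask_lt = (1u << lane) - 1u;
  // Only meaningful for lanes that passed flag == true
  return warp_base + __popc(ballot & lanemask_lt);
}

// Example use: compact the even values of an array into d_out with one atomic per warp.
__global__ void CompactEvensSketch(const unsigned* values, std::size_t n, unsigned* d_out,
                                   unsigned* d_count) {
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    const bool keep = (values[i] % 2) == 0;
    const unsigned slot = WarpAggregatedAddSketch(d_count, keep);
    if (keep) {
      d_out[slot] = values[i];
    }
  }
}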
-struct WriteResultsFunctor { - bst_uint* ridx_in; - bst_uint* ridx_out; - PartitionCountsT *counts; - - __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { - std::size_t scatter_address; - if (x.flag) { - bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan - scatter_address = x.segment_start + num_previous_flagged; - } else { - - bst_uint num_previous_unflagged = (x.idx - x.segment_start) - x.flag_scan; - scatter_address = x.segment_end - num_previous_unflagged - 1; - } - ridx_out[scatter_address] = ridx_in[x.idx]; - - if (x.idx == (x.segment_end - 1)) { - // Write out counts - counts[x.batch_idx] = {x.flag_scan,0}; - } - - // Discard - return {}; - } -}; - -template -void SortPositionBatch(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; - - auto discard_write_iterator = - thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); - auto counting = thrust::make_counting_iterator(0llu); - auto input_iterator = - dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - auto op_res = op(ridx[item_idx], data); - return IndexFlagTuple{bst_uint(item_idx), - op_res, - bst_uint(batch_info.data()[batch_idx].segment.begin), - bst_uint(batch_info.data()[batch_idx].segment.end), - batch_idx, - op_res}; - }); - size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, - discard_write_iterator, IndexFlagOp(), total_rows, stream); - - // copy active segments back to original buffer - dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - ridx[item_idx] = ridx_tmp[item_idx]; - }); -} - - __forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } -/*! \brief Count how many rows are assigned to left node. */ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, int16_t batch_idx) { int mask = __activemask(); int leader = __ffs(mask) - 1; - unsigned int prefix = __popc(mask & __lanemask_lt()); + uint32_t prefix = __popc(mask & __lanemask_lt()); bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); // If all threads here are working on the same node // we can do a more efficient reduction with warp intrinsics if (group_is_contiguous) { - unsigned ballot = __ballot_sync(mask, go_left); + uint32_t ballot = __ballot_sync(mask, go_left); uint32_t global_left_count = 0; uint32_t global_right_count = 0; if (prefix == 0) { @@ -173,11 +74,7 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, uint32_t local_left_count = __popc(ballot & __lanemask_lt()); uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; - if (go_left) { - return global_left_count + local_left_count; - } else { - return global_right_count + local_right_count; - } + return go_left ? 
global_left_count + local_left_count : global_right_count + local_right_count; } else { auto address = go_left ? &d_counts->first : &d_counts->second; @@ -185,7 +82,7 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, } } -template +template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, @@ -197,7 +94,8 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( const common::Span> batch_info(s_batch_info, d_batch_info.size()); __syncthreads(); - for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; + idx += blockDim.x * gridDim.x) { int16_t batch_idx; std::size_t item_idx; OpDataT data; @@ -262,7 +160,7 @@ class RowPartitioner { dh::TemporaryArray ridx_tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; - std::vector streams_; + cudaStream_t stream_; public: RowPartitioner(int device_idx, size_t num_rows); @@ -305,7 +203,7 @@ class RowPartitioner { } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(KernelMemcpyArgs), - cudaMemcpyDefault, streams_[1])); + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); @@ -314,13 +212,13 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows,op, - streams_[1]); + stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), - cudaMemcpyDefault, streams_[1])); + cudaMemcpyDefault, stream_)); - dh::safe_cuda(cudaStreamSynchronize(streams_[1])); + dh::safe_cuda(cudaStreamSynchronize(stream_)); // Update segments for (int i = 0; i < nidx.size(); i++) { diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 3ff18a016f33..d0e0f850a191 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -94,28 +94,5 @@ TEST(GpuHist, SortPositionBatch) { TestSortPositionBatch({0, 1, 2, 3, 4, 5}, {{3, 6}, {0, 2}}); } -/* -void TestAtomicIncrement(const std::vector& group_in, const std::vector& increment_in) { - thrust::device_vector group(group_in); - thrust::device_vector increment(increment_in); - thrust::device_vector reference(group_in.size()); - thrust::device_vector result(group_in.size()); - - auto d_group = group.data().get(); - auto d_increment = increment.data().get(); - auto d_reference = reference.data().get(); - auto d_result = result.data().get(); - dh::LaunchN(group.size(), [=] __device__(std::size_t idx) { - AtomicIncrement(d_result, d_increment[idx], d_group[idx]); - atomicAdd(d_reference + d_group[idx], d_increment[idx]); - }); - - EXPECT_EQ(reference, result); -} - -TEST(GpuHist, AtomicIncrement) { - TestAtomicIncrement({0, 0, 0}, {1, 0, 1}); - TestAtomicIncrement({0, 0, 1}, {1, 0, 1}); -}*/ } // namespace tree } // namespace xgboost From a764986612d46b0ebc82c346978bd184ece45811 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 1 Jun 2022 06:23:43 -0700 Subject: [PATCH 44/64] Use pointer for shared memory --- src/tree/gpu_hist/row_partitioner.cuh | 104 
+++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 11 +- 2 files changed, 70 insertions(+), 45 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ca59fff337ab..f44fd8d0962e 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -28,26 +28,25 @@ struct Segment { using PartitionCountsT = thrust::pair; +// TODO(Rory): Can be larger. To be tuned alongside other batch operations. +static const int kMaxUpdatePositionBatchSize = 32; template -struct KernelMemcpyArgs { +struct PerNodeData { Segment segment; OpDataT data; }; template -__device__ void AssignBatch(const common::Span> batch_info, - std::size_t idx, int16_t& batch_idx, std::size_t& item_idx, OpDataT&data) { - const auto ptr = batch_info.data(); +__device__ __forceinline__ void AssignBatch(const PerNodeData *batch_info, + std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { std::size_t sum = 0; - - for (int16_t i = 0; i < batch_info.size(); i++) { - if (sum + ptr[i].segment.Size() > idx) { - batch_idx = i; - item_idx = (idx - sum) + ptr[i].segment.begin; - data = ptr[i].data; + for (int16_t i = 0; i < kMaxUpdatePositionBatchSize; i++) { + if (sum + batch_info[i].segment.Size() > global_thread_idx) { + *batch_idx = i; + *item_idx = (global_thread_idx - sum) + batch_info[i].segment.begin; break; } - sum += ptr[i].segment.Size(); + sum += batch_info[i].segment.Size(); } } @@ -82,40 +81,70 @@ __forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, } } -template +template +struct SharedStorage { + PerNodeData data[kMaxUpdatePositionBatchSize]; + // Collectively load from global memory into shared memory + template + __device__ const PerNodeData* BlockLoad( + const common::Span> d_batch_info) { + for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { + data[i] = d_batch_info.data()[i]; + } + __syncthreads(); + return data; + } +}; + +template __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> d_batch_info, common::Span d_ridx, + const common::Span> d_batch_info, common::Span d_ridx, common::Span ridx_tmp, common::Span counts, OpT op, std::size_t total_rows) { - __shared__ KernelMemcpyArgs s_batch_info[32]; - for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { - s_batch_info[i] = d_batch_info.data()[i]; - } - const common::Span> batch_info(s_batch_info, d_batch_info.size()); - __syncthreads(); + // Initialise shared memory this way to avoid calling constructors + __shared__ cub::Uninitialized> shared; + auto batch_info = shared.Alias().BlockLoad(d_batch_info); for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { - int16_t batch_idx; + int batch_idx; std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - auto segment = batch_info[batch_idx].segment; + AssignBatch(batch_info, idx,&batch_idx, &item_idx); auto ridx = d_ridx[item_idx]; - auto op_res = op(ridx, data); + auto op_res = op(ridx, batch_info[batch_idx].data); auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); + auto segment = batch_info[batch_idx].segment; auto destination_address = op_res ? 
segment.begin + current_num_items : segment.end - current_num_items - 1; ridx_tmp[destination_address] = ridx; } } +template +__global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( + const common::Span> d_batch_info, common::Span d_ridx, + common::Span ridx_tmp, + std::size_t total_rows) { + + __shared__ cub::Uninitialized> shared; + auto batch_info = shared.Alias().BlockLoad(d_batch_info); + + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; + idx += blockDim.x * gridDim.x) { + int batch_idx; + std::size_t item_idx; + AssignBatch(batch_info, idx,&batch_idx, &item_idx); + d_ridx[item_idx] = ridx_tmp[item_idx]; + } +} + template -void SortPositionBatchUnstable(const common::Span> batch_info, +void SortPositionBatchUnstable(const common::Span> batch_info, common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, cudaStream_t stream) { - CHECK_LE(batch_info.size(), 32); + CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); @@ -123,14 +152,8 @@ void SortPositionBatchUnstable(const common::Span> bat SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); - // copy active segments back to original buffer - dh::LaunchN(total_rows, stream, [=] __device__(std::size_t idx) { - int16_t batch_idx; - std::size_t item_idx; - OpDataT data; - AssignBatch(batch_info, idx, batch_idx, item_idx, data); - ridx[item_idx] = ridx_tmp[item_idx]; - }); + SortPositionCopyKernel + <<>>(batch_info, ridx, ridx_tmp, total_rows); } /** \brief Class responsible for tracking subsets of rows as we add splits and @@ -193,8 +216,8 @@ class RowPartitioner { CHECK_EQ(nidx.size(), right_nidx.size()); CHECK_EQ(nidx.size(), op_data.size()); - auto h_batch_info = pinned2_.GetSpan>(nidx.size()); - dh::TemporaryArray> d_batch_info(nidx.size()); + auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { @@ -202,7 +225,7 @@ class RowPartitioner { total_rows += ridx_segments_.at(nidx.at(i)).Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), - h_batch_info.size() * sizeof(KernelMemcpyArgs), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, stream_)); // Temporary arrays @@ -210,9 +233,10 @@ class RowPartitioner { dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatchUnstable( dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), - dh::ToSpan(d_counts), total_rows,op, - stream_); + SortPositionBatchUnstable(common::Span>( + d_batch_info.data().get(), d_batch_info.size()), + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), + total_rows, op, stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index d0e0f850a191..92bccff35330 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -59,8 +59,8 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector op_data(segments.size()); - std::vector> h_batch_info(segments.size()); - dh::TemporaryArray> 
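// Sketch (assumed names) of the collective shared-memory load the kernels above perform
// on the small per-node array: each thread copies a strided subset, then the block
// synchronises before anyone reads the cache. The patch wraps its struct in
// cub::Uninitialized because __shared__ variables may not run non-trivial constructors;
// this sketch sidesteps that by assuming a trivially constructible T.
template <typename T, int kBlockSize, int kMaxItems>
struct BlockCacheSketch {
  T data[kMaxItems];

  __device__ const T* Load(const T* d_items, int n_items) {
    for (int i = threadIdx.x; i < n_items; i += kBlockSize) {
      data[i] = d_items[i];  // one strided pass over the small global array
    }
    __syncthreads();         // every thread sees the cached copy after this point
    return data;
  }
};

// Usage inside a kernel body (T must be trivially constructible to live in __shared__):
//   __shared__ BlockCacheSketch<int2, 256, 32> cache;
//   const int2* batch = cache.Load(d_batch_info, n_nodes);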
d_batch_info(segments.size()); + std::vector> h_batch_info(segments.size()); + dh::TemporaryArray> d_batch_info(segments.size()); std::size_t total_rows = 0; for (int i = 0; i < segments.size(); i++) { @@ -68,10 +68,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, nullptr)); - SortPositionBatchUnstable(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, nullptr); + SortPositionBatchUnstable( + common::Span>(d_batch_info.data().get(), d_batch_info.size()), + dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 001c2f267ef8e7bd722e0ee66a93da4a6d823e12 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:08:44 -0700 Subject: [PATCH 45/64] Row partitioner grid --- src/tree/gpu_hist/row_partitioner.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index f44fd8d0962e..70920ffcc3b3 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -110,7 +110,7 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( idx += blockDim.x * gridDim.x) { int batch_idx; std::size_t item_idx; - AssignBatch(batch_info, idx,&batch_idx, &item_idx); + AssignBatch(batch_info, idx, &batch_idx, &item_idx); auto ridx = d_ridx[item_idx]; auto op_res = op(ridx, batch_info[batch_idx].data); auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); @@ -147,7 +147,7 @@ void SortPositionBatchUnstable(const common::Span> ba CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = - std::max(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + std::min(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From 70bad86552e6fe40b0c6434c9623426fb003e13e Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:09:57 -0700 Subject: [PATCH 46/64] Custom FinalizePositionKernel --- src/tree/gpu_hist/finalize_position.cuh | 113 ++++++++++++++++++++++++ src/tree/updater_gpu_hist.cu | 62 ++----------- 2 files changed, 118 insertions(+), 57 deletions(-) create mode 100644 src/tree/gpu_hist/finalize_position.cuh diff --git a/src/tree/gpu_hist/finalize_position.cuh b/src/tree/gpu_hist/finalize_position.cuh new file mode 100644 index 000000000000..0f5ec36f649f --- /dev/null +++ b/src/tree/gpu_hist/finalize_position.cuh @@ -0,0 +1,113 @@ +/*! 
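// Sketch (assumed names) of the capped grid-stride launch these kernels use: the grid is
// at most 256 blocks (hence std::min rather than std::max above) and the in-kernel loop
// strides over any remaining elements, so small inputs do not launch more blocks than
// they need.
#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

constexpr int kBlockSizeSketch = 256;

__global__ void ScaleKernelSketch(float* data, std::size_t n, float factor) {
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    data[i] *= factor;  // stand-in for the real per-row work
  }
}

void LaunchScaleSketch(float* data, std::size_t n, float factor, cudaStream_t stream) {
  if (n == 0) return;
  const int grid = static_cast<int>(
      std::min<std::size_t>(256, (n + kBlockSizeSketch - 1) / kBlockSizeSketch));
  ScaleKernelSketch<<<grid, kBlockSizeSketch, 0, stream>>>(data, n, factor);
}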
+ * Copyright 2017-2022 XGBoost contributors + */ +#pragma once +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/span.h" +#include "../../data/ellpack_page.cuh" + +namespace xgboost { +namespace tree { + +template +__device__ const RegTree::Node *LoadTree(common::Span d_nodes, int *smem) { + if (!kUseShared) { + return d_nodes.data(); + } + + auto nodes = reinterpret_cast(smem); + for (int i = threadIdx.x; i < d_nodes.size(); i += kBlockSize) { + nodes[i]=d_nodes[i]; + } + __syncthreads(); + return nodes; +} + +template +__global__ __launch_bounds__(kBlockSize) void FinalizePositionKernel( + common::Span d_nodes, common::Span feature_types, + common::Span categories, + common::Span categories_segments, + common::Span gradients, const EllpackDeviceAccessor dmatrix, + common::Span predictions, common::Span position) { + extern __shared__ int s[]; + auto nodes = LoadTree(d_nodes, s); + auto new_position_op = [&] __device__(size_t row_id) { + // What happens if user prune the tree? + if (!dmatrix.IsInRange(row_id)) { + return -1; + } + int row_position = RegTree::kRoot; + auto node = nodes[row_position]; + + while (!node.IsLeaf()) { + bst_float element = dmatrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + row_position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(feature_types, row_position)) { + auto node_cats = categories.subspan(categories_segments[row_position].beg, + categories_segments[row_position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); + } + if (go_left) { + row_position = node.LeftChild(); + } else { + row_position = node.RightChild(); + } + } + node = nodes[row_position]; + } + + return row_position; + }; // NOLINT + + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < position.size(); + idx += blockDim.x * gridDim.x) { + bst_node_t row_position = new_position_op(idx); + predictions[idx] = nodes[row_position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = gradients[idx].GetHess() - .0f == 0.f; + position[idx] = is_sampled ? ~row_position : row_position; + } +} + +inline void CallFinalizePosition(common::Span nodes, + common::Span feature_types, + common::Span categories, + common::Span categories_segments, + common::Span gradients, + const EllpackDeviceAccessor dmatrix, + common::Span predictions, + common::Span position){ + + // Use shared memory? + int device = 0; + dh::safe_cuda(cudaGetDevice(&device)); + int max_shared_memory = dh::MaxSharedMemoryOptin(device); + size_t smem_size = sizeof( RegTree::Node) * + nodes.size(); + bool shared = smem_size <= max_shared_memory; + smem_size = shared ? 
smem_size : 0; + constexpr int kBlockSize = 256; + const int grid_size = + std::min(256, static_cast(xgboost::common::DivRoundUp(position.size(), kBlockSize))); + + if (shared) { + FinalizePositionKernel + <<>>(nodes, feature_types, categories, categories_segments, + gradients, dmatrix, predictions, position); + } else { + FinalizePositionKernel + <<>>(nodes, feature_types, categories, categories_segments, + gradients, dmatrix, predictions, position); + } +} +}; // namespace tree +}; // namespace xgboost \ No newline at end of file diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1c38ba51d7..a6a186b61bbd 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,6 +38,7 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/evaluate_splits.cuh" #include "gpu_hist/expand_entry.cuh" +#include "gpu_hist/finalize_position.cuh" #include "xgboost/task.h" #include "xgboost/tree_model.h" @@ -436,66 +437,13 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); - } - - void FinalisePositionInPage(EllpackPageImpl const *page, - const common::Span d_nodes, - common::Span d_feature_types, - common::Span categories, - common::Span categories_segments, - ObjInfo task, - HostDeviceVector* p_out_position) { - auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto d_gpair = this->gpair; - auto new_position_op = [=] __device__(size_t row_id) { - // What happens if user prune the tree? - if (!d_matrix.IsInRange(row_id)) { - return -1; - } - int position = RegTree::kRoot; - auto node = d_nodes[position]; - - while (!node.IsLeaf()) { - bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(d_feature_types, position)) { - auto node_cats = categories.subspan(categories_segments[position].beg, - categories_segments[position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - position = node.LeftChild(); - } else { - position = node.RightChild(); - } - } - node = d_nodes[position]; - } - - return position; - }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); - auto d_update_predictions = dh::ToSpan(update_predictions); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { - bst_node_t position = new_position_op(idx); - d_update_predictions[idx] = d_nodes[position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled ? 
~position : position; - }); + CallFinalizePosition(dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), dh::ToSpan(d_categories), + dh::ToSpan(d_categories_segments), this->gpair, + page->GetDeviceAccessor(ctx_->gpu_id), dh::ToSpan(update_predictions), + p_out_position->DeviceSpan()); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From 31e02f0614145b020274617bc3917f26aabad13a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:12:08 -0700 Subject: [PATCH 47/64] Revert "Custom FinalizePositionKernel" This reverts commit 70bad86552e6fe40b0c6434c9623426fb003e13e. --- src/tree/gpu_hist/finalize_position.cuh | 113 ------------------------ src/tree/updater_gpu_hist.cu | 62 +++++++++++-- 2 files changed, 57 insertions(+), 118 deletions(-) delete mode 100644 src/tree/gpu_hist/finalize_position.cuh diff --git a/src/tree/gpu_hist/finalize_position.cuh b/src/tree/gpu_hist/finalize_position.cuh deleted file mode 100644 index 0f5ec36f649f..000000000000 --- a/src/tree/gpu_hist/finalize_position.cuh +++ /dev/null @@ -1,113 +0,0 @@ -/*! - * Copyright 2017-2022 XGBoost contributors - */ -#pragma once -#include "xgboost/base.h" -#include "xgboost/data.h" -#include "xgboost/span.h" -#include "../../data/ellpack_page.cuh" - -namespace xgboost { -namespace tree { - -template -__device__ const RegTree::Node *LoadTree(common::Span d_nodes, int *smem) { - if (!kUseShared) { - return d_nodes.data(); - } - - auto nodes = reinterpret_cast(smem); - for (int i = threadIdx.x; i < d_nodes.size(); i += kBlockSize) { - nodes[i]=d_nodes[i]; - } - __syncthreads(); - return nodes; -} - -template -__global__ __launch_bounds__(kBlockSize) void FinalizePositionKernel( - common::Span d_nodes, common::Span feature_types, - common::Span categories, - common::Span categories_segments, - common::Span gradients, const EllpackDeviceAccessor dmatrix, - common::Span predictions, common::Span position) { - extern __shared__ int s[]; - auto nodes = LoadTree(d_nodes, s); - auto new_position_op = [&] __device__(size_t row_id) { - // What happens if user prune the tree? - if (!dmatrix.IsInRange(row_id)) { - return -1; - } - int row_position = RegTree::kRoot; - auto node = nodes[row_position]; - - while (!node.IsLeaf()) { - bst_float element = dmatrix.GetFvalue(row_id, node.SplitIndex()); - // Missing value - if (isnan(element)) { - row_position = node.DefaultChild(); - } else { - bool go_left = true; - if (common::IsCat(feature_types, row_position)) { - auto node_cats = categories.subspan(categories_segments[row_position].beg, - categories_segments[row_position].size); - go_left = common::Decision(node_cats, element, node.DefaultLeft()); - } else { - go_left = element <= node.SplitCond(); - } - if (go_left) { - row_position = node.LeftChild(); - } else { - row_position = node.RightChild(); - } - } - node = nodes[row_position]; - } - - return row_position; - }; // NOLINT - - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < position.size(); - idx += blockDim.x * gridDim.x) { - bst_node_t row_position = new_position_op(idx); - predictions[idx] = nodes[row_position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = gradients[idx].GetHess() - .0f == 0.f; - position[idx] = is_sampled ? 
~row_position : row_position; - } -} - -inline void CallFinalizePosition(common::Span nodes, - common::Span feature_types, - common::Span categories, - common::Span categories_segments, - common::Span gradients, - const EllpackDeviceAccessor dmatrix, - common::Span predictions, - common::Span position){ - - // Use shared memory? - int device = 0; - dh::safe_cuda(cudaGetDevice(&device)); - int max_shared_memory = dh::MaxSharedMemoryOptin(device); - size_t smem_size = sizeof( RegTree::Node) * - nodes.size(); - bool shared = smem_size <= max_shared_memory; - smem_size = shared ? smem_size : 0; - constexpr int kBlockSize = 256; - const int grid_size = - std::min(256, static_cast(xgboost::common::DivRoundUp(position.size(), kBlockSize))); - - if (shared) { - FinalizePositionKernel - <<>>(nodes, feature_types, categories, categories_segments, - gradients, dmatrix, predictions, position); - } else { - FinalizePositionKernel - <<>>(nodes, feature_types, categories, categories_segments, - gradients, dmatrix, predictions, position); - } -} -}; // namespace tree -}; // namespace xgboost \ No newline at end of file diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a6a186b61bbd..3d1c38ba51d7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -38,7 +38,6 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/evaluate_splits.cuh" #include "gpu_hist/expand_entry.cuh" -#include "gpu_hist/finalize_position.cuh" #include "xgboost/task.h" #include "xgboost/tree_model.h" @@ -437,13 +436,66 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); + } + + void FinalisePositionInPage(EllpackPageImpl const *page, + const common::Span d_nodes, + common::Span d_feature_types, + common::Span categories, + common::Span categories_segments, + ObjInfo task, + HostDeviceVector* p_out_position) { + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_gpair = this->gpair; + auto new_position_op = [=] __device__(size_t row_id) { + // What happens if user prune the tree? 
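// Clarifying comment (added; not part of the original patch): IsInRange() appears to test
// whether row_id lies inside this ELLPACK page's row window, so rows outside the page are
// given position -1 below instead of being walked down the tree.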
+ if (!d_matrix.IsInRange(row_id)) { + return -1; + } + int position = RegTree::kRoot; + auto node = d_nodes[position]; + + while (!node.IsLeaf()) { + bst_float element = d_matrix.GetFvalue(row_id, node.SplitIndex()); + // Missing value + if (isnan(element)) { + position = node.DefaultChild(); + } else { + bool go_left = true; + if (common::IsCat(d_feature_types, position)) { + auto node_cats = categories.subspan(categories_segments[position].beg, + categories_segments[position].size); + go_left = common::Decision(node_cats, element, node.DefaultLeft()); + } else { + go_left = element <= node.SplitCond(); + } + if (go_left) { + position = node.LeftChild(); + } else { + position = node.RightChild(); + } + } + node = d_nodes[position]; + } + + return position; + }; // NOLINT p_out_position->SetDevice(ctx_->gpu_id); p_out_position->Resize(page->n_rows); update_predictions.resize(page->n_rows); - CallFinalizePosition(dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), dh::ToSpan(d_categories), - dh::ToSpan(d_categories_segments), this->gpair, - page->GetDeviceAccessor(ctx_->gpu_id), dh::ToSpan(update_predictions), - p_out_position->DeviceSpan()); + auto d_update_predictions = dh::ToSpan(update_predictions); + auto sorted_position = p_out_position->DevicePointer(); + dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { + bst_node_t position = new_position_op(idx); + d_update_predictions[idx] = d_nodes[position].LeafValue(); + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + sorted_position[idx] = is_sampled ? ~position : position; + }); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From b86cb2930761651ed6769b20da18c92efb1cf1ff Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 2 Jun 2022 08:41:32 -0700 Subject: [PATCH 48/64] Reduce grid size --- src/tree/gpu_hist/row_partitioner.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 70920ffcc3b3..a8c1b09df161 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -147,7 +147,7 @@ void SortPositionBatchUnstable(const common::Span> ba CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; const int grid_size = - std::min(256, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + std::min(128, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From c3944af9edc3a0a48bf429915784dd54a5c266ba Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sat, 4 Jun 2022 12:24:30 -0700 Subject: [PATCH 49/64] Tune items/thread --- src/tree/gpu_hist/row_partitioner.cuh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a8c1b09df161..ad664d5cdab5 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -146,8 +146,10 @@ void SortPositionBatchUnstable(const common::Span> ba OpT op, cudaStream_t stream) { CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); constexpr int kBlockSize = 256; - const int grid_size = - std::min(128, static_cast(xgboost::common::DivRoundUp(total_rows, kBlockSize))); + + // Value found by experimentation + const int 
kItemsThread = 12; + const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionBatchUnstableKernel <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); From cdd134ac1bca6a2ba21ba7bca5b8335422f27f09 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 6 Jun 2022 04:59:15 -0700 Subject: [PATCH 50/64] FinalisePosition custom kernel --- src/tree/gpu_hist/row_partitioner.cu | 4 +- src/tree/gpu_hist/row_partitioner.cuh | 112 ++++++++++++++++++++++++-- src/tree/updater_gpu_hist.cu | 33 ++++---- 3 files changed, 125 insertions(+), 24 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index cc117ae743e0..53b6039dabd3 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,7 @@ namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); - ridx_segments_.emplace_back(Segment(0, num_rows)); + ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); dh::safe_cuda(cudaStreamCreate(&stream_)); } @@ -26,7 +26,7 @@ RowPartitioner::~RowPartitioner() { common::Span RowPartitioner::GetRows( bst_node_t nidx) { - auto segment = ridx_segments_.at(nidx); + auto segment = ridx_segments_.at(nidx).segment; // Return empty span here as a valid result // Will error if we try to construct a span from a pointer with size 0 if (segment.Size() == 0) { diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ad664d5cdab5..6461f0f98cd6 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -158,12 +158,54 @@ void SortPositionBatchUnstable(const common::Span> ba <<>>(batch_info, ridx, ridx_tmp, total_rows); } +struct NodePositionInfo { + Segment segment; + int left_child = -1; + int right_child = -1; + __device__ bool IsLeaf() { return left_child == -1; } +}; + +__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const NodePositionInfo* d_node_info) { + int position = 0; + NodePositionInfo node = d_node_info[position]; + while (!node.IsLeaf()) { + NodePositionInfo left = d_node_info[node.left_child]; + NodePositionInfo right = d_node_info[node.right_child]; + if (idx >= left.segment.begin && idx < left.segment.end) { + position = node.left_child; + node = left; + } else if (idx >= right.segment.begin && idx < right.segment.end) { + position = node.right_child; + node = right; + } else { + KERNEL_CHECK(false); + } + } + return position; +} + +template +__global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( + const common::Span d_node_info, + const common::Span d_ridx, common::Span d_out_position, OpT op, + IsSampledOpT is_sampled) { + bst_node_t* out_ptr = d_out_position.data(); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); + idx += blockDim.x * gridDim.x) { + auto position = GetPositionFromSegments(idx, d_node_info.data()); + RowIndexT ridx = d_ridx.data()[idx]; + bst_node_t new_position = op(ridx, position); + out_ptr[ridx] = is_sampled(ridx) ? ~new_position : new_position; + } +} + /** \brief Class responsible for tracking subsets of rows as we add splits and * partition training rows into different leaf nodes. 
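A side note on the launch tuning above (kItemsThread, grid_size): the kernels here use grid-stride loops, so the grid only needs roughly total_rows / (block size * items per thread) blocks, and each thread then processes several rows. A minimal, self-contained sketch of the pattern, assuming only the CUDA runtime (ScaleKernel, LaunchScale and kItemsPerThread are illustrative names, not from the patch):

#include <algorithm>
#include <cstddef>
#include <cuda_runtime.h>

namespace sketch {
constexpr int kBlockSize = 256;
constexpr int kItemsPerThread = 12;  // assumed here; the patch settles on values of 8-12 by experiment

__global__ void ScaleKernel(const float* in, float* out, std::size_t n) {
  // Grid-stride loop: each thread visits every (blockDim.x * gridDim.x)-th element.
  for (std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += blockDim.x * gridDim.x) {
    out[i] = 2.0f * in[i];
  }
}

inline void LaunchScale(const float* in, float* out, std::size_t n, cudaStream_t stream) {
  // Round up so that grid_size * kBlockSize * kItemsPerThread covers all n elements.
  const int grid_size = std::max<int>(
      1, static_cast<int>((n + kBlockSize * kItemsPerThread - 1) / (kBlockSize * kItemsPerThread)));
  ScaleKernel<<<grid_size, kBlockSize, 0, stream>>>(in, out, n);
}
}  // namespace sketch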
*/ class RowPartitioner { public: using RowIndexT = bst_uint; + private: int device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to @@ -174,7 +216,8 @@ class RowPartitioner { * node id -> segment -> indices of rows belonging to node */ /*! \brief Range of row index for each node, pointers into ridx below. */ - std::vector ridx_segments_; + + std::vector ridx_segments_; /*! \brief mapping for node id -> rows. * This looks like: * node id | 1 | 2 | @@ -223,8 +266,8 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)), op_data.at(i)}; - total_rows += ridx_segments_.at(nidx.at(i)).Size(); + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; + total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), @@ -248,16 +291,73 @@ class RowPartitioner { // Update segments for (int i = 0; i < nidx.size(); i++) { - auto segment = ridx_segments_.at(nidx[i]); + auto segment = ridx_segments_.at(nidx[i]).segment; auto left_count = h_counts[i].first; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), std::max(left_nidx[i], right_nidx[i]) + 1)); - ridx_segments_[left_nidx[i]] = Segment(segment.begin, segment.begin + left_count); - ridx_segments_[right_nidx[i]] = Segment(segment.begin + left_count, segment.end); + ridx_segments_[nidx[i]] = NodePositionInfo{segment, left_nidx[i], right_nidx[i]}; + ridx_segments_[left_nidx[i]] = + NodePositionInfo{Segment(segment.begin, segment.begin + left_count)}; + ridx_segments_[right_nidx[i]] = + NodePositionInfo{Segment(segment.begin + left_count, segment.end)}; } } + + /** + * \brief Finalise the position of all training instances after tree construction is + * complete. Does not update any other meta information in this data structure, so + * should only be used at the end of training. + * + * When the task requires update leaf, this function will copy the node index into + * p_out_position. The index is negated if it's being sampled in current iteration. + * + * \param p_out_position Node index for each row. + * \param op Device lambda. Should provide the row index and current position as an + * argument and return the new position for this training instance. + * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
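As a reading aid for the new finalisation path: a row's final node can be recovered purely from the per-node segments recorded while splitting, by descending from the root and picking whichever child's segment contains the row's slot, which is the walk GetPositionFromSegments performs on the device. A host-side sketch with illustrative names (NodeInfoSketch, PositionFromSegments), not part of the patch:

#include <cassert>
#include <cstddef>
#include <vector>

struct NodeInfoSketch {
  std::size_t begin{0}, end{0};  // slots in the row-index buffer owned by this node
  int left_child{-1}, right_child{-1};
  bool IsLeaf() const { return left_child == -1; }
};

inline int PositionFromSegments(std::size_t slot, const std::vector<NodeInfoSketch>& nodes) {
  int position = 0;  // start at the root
  NodeInfoSketch node = nodes[position];
  while (!node.IsLeaf()) {
    const NodeInfoSketch& left = nodes[node.left_child];
    const NodeInfoSketch& right = nodes[node.right_child];
    if (slot >= left.begin && slot < left.end) {
      position = node.left_child;
      node = left;
    } else if (slot >= right.begin && slot < right.end) {
      position = node.right_child;
      node = right;
    } else {
      assert(false && "slot not covered by either child's segment");
      break;
    }
  }
  return position;
}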
+ */ + template + void FinalisePosition( + common::Span d_out_position, FinalisePositionOpT op, + Sampledp sampledp) { + dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); + dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), + sizeof(NodePositionInfo) * ridx_segments_.size(), + cudaMemcpyDefault, stream_)); + + auto d_node_info = d_node_info_storage.data().get(); + + auto current_position = [=] __device__(std::size_t idx) { + int position = 0; + NodePositionInfo node = d_node_info[position]; + while (!node.IsLeaf()) { + NodePositionInfo left = d_node_info[node.left_child]; + NodePositionInfo right = d_node_info[node.right_child]; + if (idx >= left.segment.begin && idx < left.segment.end) { + position = node.left_child; + node = left; + } else if (idx >= right.segment.begin && idx < right.segment.end) { + position = node.right_child; + node = right; + } else { + KERNEL_CHECK(false); + } + } + return position; + }; + + constexpr int kBlockSize = 256; + + // Value found by experimentation + const int kItemsThread = 12; + const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); + common::Span d_ridx(ridx_.data().get(), ridx_.size()); + FinalisePositionKernel<<>>( + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op, sampledp); + } }; + }; // namespace tree }; // namespace xgboost diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3d1c38ba51d7..8dc8ff97b120 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -450,12 +450,16 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; - auto new_position_op = [=] __device__(size_t row_id) { + update_predictions.resize(row_partitioner->GetRows().size()); + auto d_update_predictions = dh::ToSpan(update_predictions); + p_out_position->SetDevice(ctx_->gpu_id); + p_out_position->Resize(row_partitioner->GetRows().size()); + + auto new_position_op = [=] __device__(size_t row_id, int position) { // What happens if user prune the tree? if (!d_matrix.IsInRange(row_id)) { return -1; } - int position = RegTree::kRoot; auto node = d_nodes[position]; while (!node.IsLeaf()) { @@ -478,24 +482,21 @@ struct GPUHistMakerDevice { position = node.RightChild(); } } + node = d_nodes[position]; } + d_update_predictions[row_id] = node.LeafValue(); return position; - }; // NOLINT - p_out_position->SetDevice(ctx_->gpu_id); - p_out_position->Resize(page->n_rows); - update_predictions.resize(page->n_rows); - auto d_update_predictions = dh::ToSpan(update_predictions); - auto sorted_position = p_out_position->DevicePointer(); - dh::LaunchN(page->n_rows, [=] __device__(size_t idx) { - bst_node_t position = new_position_op(idx); - d_update_predictions[idx] = d_nodes[position].LeafValue(); - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - bool is_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - sorted_position[idx] = is_sampled ? ~position : position; - }); + }; // NOLINT + + auto is_sampled_op = [d_gpair] __device__(size_t ridx) { + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. 
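// Added note (not in the original patch): with gradient-based sampling, the gradient pair of a
// row dropped for this iteration is zeroed out, so a Hessian of exactly zero is the marker used
// here; such rows later receive a bit-negated node index so downstream consumers can tell them
// apart from rows that were kept.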
+ return d_gpair[ridx].GetHess() - .0f == 0.f; + }; + + row_partitioner->FinalisePosition(p_out_position->DeviceSpan(), new_position_op, is_sampled_op); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { From edabc455063351761992d51d34178e642ae257a6 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 7 Jun 2022 13:31:56 -0700 Subject: [PATCH 51/64] Fixing slow scatter --- src/tree/gpu_hist/row_partitioner.cuh | 26 +++++++++---------- src/tree/updater_gpu_hist.cu | 15 ++++++----- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 ++- 3 files changed, 23 insertions(+), 21 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 6461f0f98cd6..e5a4a1849ab2 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -152,7 +152,7 @@ void SortPositionBatchUnstable(const common::Span> ba const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionBatchUnstableKernel - <<>>(batch_info, ridx, ridx_tmp, d_counts, op, total_rows); + <<>>(batch_info, ridx, ridx_tmp,d_counts, op, total_rows); SortPositionCopyKernel <<>>(batch_info, ridx, ridx_tmp, total_rows); @@ -184,18 +184,16 @@ __device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const No return position; } -template +template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, - const common::Span d_ridx, common::Span d_out_position, OpT op, - IsSampledOpT is_sampled) { - bst_node_t* out_ptr = d_out_position.data(); + const common::Span d_ridx,common::Span d_out_position, OpT op) { for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); idx += blockDim.x * gridDim.x) { auto position = GetPositionFromSegments(idx, d_node_info.data()); - RowIndexT ridx = d_ridx.data()[idx]; + RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); - out_ptr[ridx] = is_sampled(ridx) ? ~new_position : new_position; + d_out_position[ridx] = new_position; } } @@ -266,7 +264,8 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, + op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), @@ -280,7 +279,7 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatchUnstable(common::Span>( d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), total_rows, op, stream_); dh::safe_cuda( cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), @@ -318,10 +317,9 @@ class RowPartitioner { * argument and return the new position for this training instance. * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
*/ - template + template void FinalisePosition( - common::Span d_out_position, FinalisePositionOpT op, - Sampledp sampledp) { + common::Span d_out_position, FinalisePositionOpT op) { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), @@ -351,11 +349,11 @@ class RowPartitioner { constexpr int kBlockSize = 256; // Value found by experimentation - const int kItemsThread = 12; + const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); FinalisePositionKernel<<>>( - dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op, sampledp); + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 8dc8ff97b120..b0c9fbaab029 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -401,6 +401,7 @@ struct GPUHistMakerDevice { } return go_left; }); + } // After tree update is finished, update the position of all training @@ -490,13 +491,15 @@ struct GPUHistMakerDevice { return position; }; // NOLINT - auto is_sampled_op = [d_gpair] __device__(size_t ridx) { - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. - return d_gpair[ridx].GetHess() - .0f == 0.f; - }; + auto d_out_position = p_out_position->DeviceSpan(); + row_partitioner->FinalisePosition(d_out_position, new_position_op); - row_partitioner->FinalisePosition(p_out_position->DeviceSpan(), new_position_op, is_sampled_op); + dh::LaunchN(row_partitioner->GetRows().size(), [=] __device__(size_t idx) { + bst_node_t position = d_out_position[idx]; + d_update_predictions[idx] = d_nodes[position].LeafValue(); + bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + d_out_position[idx] = is_row_sampled ? ~position : position; + }); } bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 92bccff35330..ccde627bd9ea 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -72,7 +72,8 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector>(d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, op, nullptr); + dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, + op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 43eb83e6c9cfe214d67ce8f22f6d2b7f14c65e3c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 13 Jun 2022 02:38:08 -0700 Subject: [PATCH 52/64] Remove unstable --- src/tree/gpu_hist/row_partitioner.cuh | 194 ++++++++++-------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 12 +- 2 files changed, 111 insertions(+), 95 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e5a4a1849ab2..ee9884a3bf36 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -17,8 +17,8 @@ namespace tree { /** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. 
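A reading aid for the batched kernels this patch reworks: a flat thread index over all active nodes is translated into a (node, slot) pair by walking the per-node segment sizes, which is what the device-side AssignBatch below does. A host-side sketch under the same assumptions (SegmentSketch and AssignBatchHost are illustrative names, not part of the patch):

#include <cstddef>
#include <cstdint>
#include <vector>

struct SegmentSketch {
  uint32_t begin{0};
  uint32_t end{0};
  uint32_t Size() const { return end - begin; }
};

// Map a flat index over the concatenated segments to (segment index, slot in the row buffer).
inline bool AssignBatchHost(const std::vector<SegmentSketch>& segments, std::size_t flat_idx,
                            int* batch_idx, std::size_t* item_idx) {
  std::size_t sum = 0;
  for (std::size_t i = 0; i < segments.size(); ++i) {
    if (sum + segments[i].Size() > flat_idx) {
      *batch_idx = static_cast<int>(i);
      *item_idx = (flat_idx - sum) + segments[i].begin;
      return true;
    }
    sum += segments[i].Size();
  }
  return false;  // flat_idx is past the total number of rows in this batch
}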
*/ struct Segment { - size_t begin{0}; - size_t end{0}; + uint32_t begin{0}; + uint32_t end{0}; Segment() = default; @@ -36,11 +36,13 @@ struct PerNodeData { OpDataT data; }; -template -__device__ __forceinline__ void AssignBatch(const PerNodeData *batch_info, +__constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; + +template +__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { - std::size_t sum = 0; - for (int16_t i = 0; i < kMaxUpdatePositionBatchSize; i++) { + uint32_t sum = 0; + for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { *batch_idx = i; *item_idx = (global_thread_idx - sum) + batch_info[i].segment.begin; @@ -50,36 +52,6 @@ __device__ __forceinline__ void AssignBatch(const PerNodeData *batch_in } } -__forceinline__ __device__ uint32_t __lanemask_lt() { return ((uint32_t)1 << cub::LaneId()) - 1; } - -__forceinline__ __device__ uint32_t AtomicIncrement(PartitionCountsT* d_counts, bool go_left, - int16_t batch_idx) { - int mask = __activemask(); - int leader = __ffs(mask) - 1; - uint32_t prefix = __popc(mask & __lanemask_lt()); - bool group_is_contiguous = __all_sync(mask, batch_idx == __shfl_sync(mask, batch_idx, leader)); - // If all threads here are working on the same node - // we can do a more efficient reduction with warp intrinsics - if (group_is_contiguous) { - uint32_t ballot = __ballot_sync(mask, go_left); - uint32_t global_left_count = 0; - uint32_t global_right_count = 0; - if (prefix == 0) { - global_left_count = atomicAdd(&d_counts->first, __popc(ballot)); - global_right_count = atomicAdd(&d_counts->second, __popc(mask) - __popc(ballot)); - } - global_left_count = __shfl_sync(mask, global_left_count, leader); - global_right_count = __shfl_sync(mask, global_right_count, leader); - uint32_t local_left_count = __popc(ballot & __lanemask_lt()); - uint32_t local_right_count = __popc(mask & __lanemask_lt()) - local_left_count; - - return go_left ? global_left_count + local_left_count : global_right_count + local_right_count; - - } else { - auto address = go_left ? 
&d_counts->first : &d_counts->second; - return atomicAdd(address, 1); - } -} template struct SharedStorage { @@ -87,75 +59,122 @@ struct SharedStorage { // Collectively load from global memory into shared memory template __device__ const PerNodeData* BlockLoad( - const common::Span> d_batch_info) { - for (int i = threadIdx.x; i < d_batch_info.size(); i += kBlockSize) { - data[i] = d_batch_info.data()[i]; + const PerNodeData* d_batch_info) { + for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { + data[i] = d_batch_info[i]; } __syncthreads(); return data; } }; -template -__global__ __launch_bounds__(kBlockSize) void SortPositionBatchUnstableKernel( - const common::Span> d_batch_info, common::Span d_ridx, - common::Span ridx_tmp, common::Span counts, OpT op, - std::size_t total_rows) { - // Initialise shared memory this way to avoid calling constructors - __shared__ cub::Uninitialized> shared; - auto batch_info = shared.Alias().BlockLoad(d_batch_info); - - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; - idx += blockDim.x * gridDim.x) { - int batch_idx; - std::size_t item_idx; - AssignBatch(batch_info, idx, &batch_idx, &item_idx); - auto ridx = d_ridx[item_idx]; - auto op_res = op(ridx, batch_info[batch_idx].data); - auto current_num_items = AtomicIncrement(&counts.data()[batch_idx], op_res, batch_idx); - auto segment = batch_info[batch_idx].segment; - auto destination_address = - op_res ? segment.begin + current_num_items : segment.end - current_num_items - 1; - ridx_tmp[destination_address] = ridx; - } -} - template __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( - const common::Span> d_batch_info, common::Span d_ridx, - common::Span ridx_tmp, + common::Span d_ridx, const common::Span ridx_tmp, std::size_t total_rows) { - + // Load this into shared memory + // the compiler puts it into registers otherwise + // then we get spilling to local memory + const PerNodeData* batch_info = + reinterpret_cast*>(constant_memory); __shared__ cub::Uninitialized> shared; - auto batch_info = shared.Alias().BlockLoad(d_batch_info); - + auto s_batch_info = shared.Alias().BlockLoad(batch_info); + for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; idx += blockDim.x * gridDim.x) { int batch_idx; std::size_t item_idx; - AssignBatch(batch_info, idx,&batch_idx, &item_idx); + AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); d_ridx[item_idx] = ridx_tmp[item_idx]; } } +// We can scan over this tuple, where the scan gives us information on how to partition inputs +// according to the flag +struct IndexFlagTuple { + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning + int batch_idx; // Which node in the batch does this item belong to + bool flag; // Result of op (is this item going left?) 
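// Annotation (added, not part of the patch): together with the IndexFlagOp defined next,
// this tuple lets a single inclusive scan produce a per-node (segmented) running count of
// rows that go left; the scan restarts whenever batch_idx changes, and the value carried by
// the last element of a segment is that node's left count, which WriteResultsFunctor stores.
// A host-side model of the combine step, with illustrative names:

#include <cstddef>
#include <cstdint>
#include <vector>

struct TupleSketch {
  uint32_t idx;        // slot in the row-index buffer
  uint32_t flag_scan;  // inclusive count of flagged items within the segment so far
  int batch_idx;       // which node/segment the item belongs to
  bool flag;           // does this row go left?
};

inline TupleSketch CombineSketch(const TupleSketch& a, const TupleSketch& b) {
  if (a.batch_idx == b.batch_idx) {
    return {b.idx, a.flag_scan + b.flag_scan, b.batch_idx, b.flag};
  }
  return b;  // crossing into a new segment restarts the running count
}

// Sequential stand-in for the device-wide inclusive scan over the tuples.
inline void InclusiveScanSketch(std::vector<TupleSketch>* items) {
  for (std::size_t i = 1; i < items->size(); ++i) {
    (*items)[i] = CombineSketch((*items)[i - 1], (*items)[i]);
  }
}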
+}; + +struct IndexFlagOp { + __device__ IndexFlagTuple operator()(const IndexFlagTuple& a, const IndexFlagTuple& b) const { + // Segmented scan - resets if we cross batch boundaries + if (a.batch_idx == b.batch_idx) { + // Accumulate the flags, everything else stays the same + return {b.idx, a.flag_scan + b.flag_scan, b.batch_idx, b.flag}; + } else { + return b; + } + } +}; + +template +struct WriteResultsFunctor { + const bst_uint* ridx_in; + bst_uint* ridx_out; + PartitionCountsT *counts; + + __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { + std::size_t scatter_address; + const PerNodeData* batch_info = + reinterpret_cast*>(constant_memory); + const Segment& segment = batch_info[x.batch_idx].segment; + if (x.flag) { + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + scatter_address = segment.begin + num_previous_flagged; + } else { + bst_uint num_previous_unflagged = (x.idx - segment.begin) - x.flag_scan; + scatter_address = segment.end - num_previous_unflagged - 1; + } + ridx_out[scatter_address] = ridx_in[x.idx]; + + if (x.idx == (segment.end - 1)) { + // Write out counts + counts[x.batch_idx] = {x.flag_scan,0}; + } + + // Discard + return {}; + } +}; + template -void SortPositionBatchUnstable(const common::Span> batch_info, - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { - CHECK_LE(batch_info.size(), kMaxUpdatePositionBatchSize); +void SortPositionBatch( + common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, + OpT op, cudaStream_t stream) { + WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; + + auto discard_write_iterator = + thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); + auto counting = thrust::make_counting_iterator(0llu); + auto input_iterator = + dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { + const PerNodeData* batch_info_itr = + reinterpret_cast*>(constant_memory); + int batch_idx; + std::size_t item_idx; + AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); + auto op_res = op(ridx[item_idx], batch_info_itr[batch_idx].data); + return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res}; + }); + size_t temp_bytes = 0; + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); + dh::TemporaryArray temp(temp_bytes); + cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + discard_write_iterator, IndexFlagOp(), total_rows, stream); + constexpr int kBlockSize = 256; // Value found by experimentation const int kItemsThread = 12; const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); - SortPositionBatchUnstableKernel - <<>>(batch_info, ridx, ridx_tmp,d_counts, op, total_rows); - - SortPositionCopyKernel - <<>>(batch_info, ridx, ridx_tmp, total_rows); + SortPositionCopyKernel + <<>>(ridx, ridx_tmp,total_rows); } struct NodePositionInfo { @@ -260,7 +279,6 @@ class RowPartitioner { CHECK_EQ(nidx.size(), op_data.size()); auto h_batch_info = pinned2_.GetSpan>(nidx.size()); - dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { @@ -268,17 +286,18 @@ class RowPartitioner { op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } - dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), - 
h_batch_info.size() * sizeof(PerNodeData), - cudaMemcpyDefault, stream_)); + static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= + sizeof(constant_memory)); + dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), 0, + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatchUnstable(common::Span>( - d_batch_info.data().get(), d_batch_info.size()), + SortPositionBatch( dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), total_rows, op, stream_); dh::safe_cuda( @@ -346,9 +365,8 @@ class RowPartitioner { return position; }; - constexpr int kBlockSize = 256; + constexpr int kBlockSize = 512; - // Value found by experimentation const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index ccde627bd9ea..0061fdb121d6 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -67,13 +67,11 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), - cudaMemcpyDefault, nullptr)); - SortPositionBatchUnstable( - common::Span>(d_batch_info.data().get(), d_batch_info.size()), - dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), dh::ToSpan(counts), total_rows, - op, nullptr); + dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), 0, + cudaMemcpyDefault, nullptr)); + SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), + dh::ToSpan(counts), total_rows, op, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 968bb29d3ef39115f674df812e94574cebec964d Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 14 Jun 2022 03:35:28 -0700 Subject: [PATCH 53/64] Format --- src/tree/gpu_hist/row_partitioner.cuh | 79 +++++++++++++-------------- 1 file changed, 37 insertions(+), 42 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index ee9884a3bf36..876cf2122920 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -2,19 +2,21 @@ * Copyright 2017-2022 XGBoost contributors */ #pragma once +#include + #include #include -#include "xgboost/base.h" + #include "../../common/device_helpers.cuh" +#include "xgboost/base.h" #include "xgboost/generic_parameters.h" #include "xgboost/task.h" #include "xgboost/tree_model.h" -#include namespace xgboost { namespace tree { - /** \brief Used to demarcate a contiguous set of row indices associated with +/** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. */ struct Segment { uint32_t begin{0}; @@ -26,7 +28,7 @@ struct Segment { __host__ __device__ size_t Size() const { return end - begin; } }; -using PartitionCountsT = thrust::pair; +using PartitionCountsT = thrust::pair; // TODO(Rory): Can be larger. To be tuned alongside other batch operations. 
static const int kMaxUpdatePositionBatchSize = 32; @@ -39,8 +41,8 @@ struct PerNodeData { __constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; template -__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, - std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { +__device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, + int* batch_idx, std::size_t* item_idx) { uint32_t sum = 0; for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { @@ -52,14 +54,12 @@ __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, } } - template struct SharedStorage { PerNodeData data[kMaxUpdatePositionBatchSize]; // Collectively load from global memory into shared memory template - __device__ const PerNodeData* BlockLoad( - const PerNodeData* d_batch_info) { + __device__ const PerNodeData* BlockLoad(const PerNodeData* d_batch_info) { for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { data[i] = d_batch_info[i]; } @@ -92,10 +92,10 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( // We can scan over this tuple, where the scan gives us information on how to partition inputs // according to the flag struct IndexFlagTuple { - bst_uint idx; // The location of the item we are working on in ridx_ - bst_uint flag_scan; // This gets populated after scanning + bst_uint idx; // The location of the item we are working on in ridx_ + bst_uint flag_scan; // This gets populated after scanning int batch_idx; // Which node in the batch does this item belong to - bool flag; // Result of op (is this item going left?) + bool flag; // Result of op (is this item going left?) }; struct IndexFlagOp { @@ -114,7 +114,7 @@ template struct WriteResultsFunctor { const bst_uint* ridx_in; bst_uint* ridx_out; - PartitionCountsT *counts; + PartitionCountsT* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; @@ -122,7 +122,7 @@ struct WriteResultsFunctor { reinterpret_cast*>(constant_memory); const Segment& segment = batch_info[x.batch_idx].segment; if (x.flag) { - bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan + bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan scatter_address = segment.begin + num_previous_flagged; } else { bst_uint num_previous_unflagged = (x.idx - segment.begin) - x.flag_scan; @@ -132,7 +132,7 @@ struct WriteResultsFunctor { if (x.idx == (segment.end - 1)) { // Write out counts - counts[x.batch_idx] = {x.flag_scan,0}; + counts[x.batch_idx] = {x.flag_scan, 0}; } // Discard @@ -141,10 +141,9 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch( - common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, - OpT op, cudaStream_t stream) { +void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, + common::Span d_counts, std::size_t total_rows, OpT op, + cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = @@ -173,8 +172,8 @@ void SortPositionBatch( const int kItemsThread = 12; const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); - SortPositionCopyKernel - <<>>(ridx, ridx_tmp,total_rows); + SortPositionCopyKernel + <<>>(ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -184,7 +183,8 @@ struct NodePositionInfo { __device__ bool IsLeaf() 
{ return left_child == -1; } }; -__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const NodePositionInfo* d_node_info) { +__device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, + const NodePositionInfo* d_node_info) { int position = 0; NodePositionInfo node = d_node_info[position]; while (!node.IsLeaf()) { @@ -206,7 +206,7 @@ __device__ __forceinline__ int GetPositionFromSegments(std::size_t idx, const No template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, - const common::Span d_ridx,common::Span d_out_position, OpT op) { + const common::Span d_ridx, common::Span d_out_position, OpT op) { for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); idx += blockDim.x * gridDim.x) { auto position = GetPositionFromSegments(idx, d_node_info.data()); @@ -222,7 +222,6 @@ class RowPartitioner { public: using RowIndexT = bst_uint; - private: int device_idx_; /*! \brief In here if you want to find the rows belong to a node nid, first you need to @@ -282,8 +281,7 @@ class RowPartitioner { std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { - h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, - op_data.at(i)}; + h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= @@ -297,13 +295,11 @@ class RowPartitioner { dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); // Partition the rows according to the operator - SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_),dh::ToSpan(d_counts), - total_rows, op, stream_); - dh::safe_cuda( - cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * d_counts.size(), - cudaMemcpyDefault, stream_)); + SortPositionBatch( + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, stream_); + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), + sizeof(decltype(d_counts)::value_type) * d_counts.size(), + cudaMemcpyDefault, stream_)); dh::safe_cuda(cudaStreamSynchronize(stream_)); @@ -323,7 +319,7 @@ class RowPartitioner { } } - /** + /** * \brief Finalise the position of all training instances after tree construction is * complete. Does not update any other meta information in this data structure, so * should only be used at the end of training. @@ -337,8 +333,7 @@ class RowPartitioner { * \param sampled A device lambda to inform the partitioner whether a row is sampled. 
*/ template - void FinalisePosition( - common::Span d_out_position, FinalisePositionOpT op) { + void FinalisePosition(common::Span d_out_position, FinalisePositionOpT op) { dh::TemporaryArray d_node_info_storage(ridx_segments_.size()); dh::safe_cuda(cudaMemcpyAsync(d_node_info_storage.data().get(), ridx_segments_.data(), sizeof(NodePositionInfo) * ridx_segments_.size(), @@ -365,13 +360,13 @@ class RowPartitioner { return position; }; - constexpr int kBlockSize = 512; + constexpr int kBlockSize = 512; - const int kItemsThread = 8; - const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); - common::Span d_ridx(ridx_.data().get(), ridx_.size()); - FinalisePositionKernel<<>>( - dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); + const int kItemsThread = 8; + const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); + common::Span d_ridx(ridx_.data().get(), ridx_.size()); + FinalisePositionKernel<<>>( + dh::ToSpan(d_node_info_storage), d_ridx, d_out_position, op); } }; From 1372ad856ba80b5b558ae3850406f64b211bd453 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 15 Jun 2022 08:33:31 -0700 Subject: [PATCH 54/64] Review comments --- cmake/Utils.cmake | 1 - src/tree/gpu_hist/row_partitioner.cu | 15 ++-- src/tree/gpu_hist/row_partitioner.cuh | 72 ++++++++----------- src/tree/updater_gpu_hist.cu | 15 ++-- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 10 +-- 5 files changed, 47 insertions(+), 66 deletions(-) diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 10f0c8104a07..cbc11feb49b6 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -136,7 +136,6 @@ function(xgboost_set_cuda_flags target) target_compile_options(${target} PRIVATE $<$:--expt-extended-lambda> $<$:--expt-relaxed-constexpr> - $<$:-lineinfo> $<$:${GEN_CODE}> $<$:-Xcompiler=${OpenMP_CXX_FLAGS}> $<$:-Xfatbin=-compress-all>) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 53b6039dabd3..015d817f3640 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -1,10 +1,12 @@ /*! - * Copyright 2017-2021 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #include #include #include + #include + #include "../../common/device_helpers.cuh" #include "row_partitioner.cuh" @@ -24,14 +26,8 @@ RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaStreamDestroy(stream_)); } -common::Span RowPartitioner::GetRows( - bst_node_t nidx) { +common::Span RowPartitioner::GetRows(bst_node_t nidx) { auto segment = ridx_segments_.at(nidx).segment; - // Return empty span here as a valid result - // Will error if we try to construct a span from a pointer with size 0 - if (segment.Size() == 0) { - return {}; - } return dh::ToSpan(ridx_).subspan(segment.begin, segment.Size()); } @@ -39,8 +35,7 @@ common::Span RowPartitioner::GetRows() { return dh::ToSpan(ridx_); } -std::vector RowPartitioner::GetRowsHost( - bst_node_t nidx) { +std::vector RowPartitioner::GetRowsHost(bst_node_t nidx) { auto span = GetRows(nidx); std::vector rows(span.size()); dh::CopyDeviceSpanToVector(&rows, span); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 876cf2122920..d27b4aa65551 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -19,17 +19,15 @@ namespace tree { /** \brief Used to demarcate a contiguous set of row indices associated with * some tree node. 
*/ struct Segment { - uint32_t begin{0}; - uint32_t end{0}; + bst_uint begin{0}; + bst_uint end{0}; Segment() = default; - Segment(size_t begin, size_t end) : begin(begin), end(end) { CHECK_GE(end, begin); } + Segment(bst_uint begin, bst_uint end) : begin(begin), end(end) { CHECK_GE(end, begin); } __host__ __device__ size_t Size() const { return end - begin; } }; -using PartitionCountsT = thrust::pair; - // TODO(Rory): Can be larger. To be tuned alongside other batch operations. static const int kMaxUpdatePositionBatchSize = 32; template @@ -43,7 +41,7 @@ __constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; template __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { - uint32_t sum = 0; + bst_uint sum = 0; for (int i = 0; i < kMaxUpdatePositionBatchSize; i++) { if (sum + batch_info[i].segment.Size() > global_thread_idx) { *batch_idx = i; @@ -80,8 +78,7 @@ __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( __shared__ cub::Uninitialized> shared; auto s_batch_info = shared.Alias().BlockLoad(batch_info); - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < total_rows; - idx += blockDim.x * gridDim.x) { + for (auto idx : dh::GridStrideRange(0, total_rows)) { int batch_idx; std::size_t item_idx; AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); @@ -114,7 +111,7 @@ template struct WriteResultsFunctor { const bst_uint* ridx_in; bst_uint* ridx_out; - PartitionCountsT* counts; + bst_uint* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; @@ -132,7 +129,7 @@ struct WriteResultsFunctor { if (x.idx == (segment.end - 1)) { // Write out counts - counts[x.batch_idx] = {x.flag_scan, 0}; + counts[x.batch_idx] = x.flag_scan; } // Discard @@ -142,7 +139,7 @@ struct WriteResultsFunctor { template void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, - common::Span d_counts, std::size_t total_rows, OpT op, + common::Span d_counts, std::size_t total_rows, OpT op, cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; @@ -178,8 +175,8 @@ void SortPositionBatch(common::Span ridx, common::Span rid struct NodePositionInfo { Segment segment; - int left_child = -1; - int right_child = -1; + bst_node_t left_child = -1; + bst_node_t right_child = -1; __device__ bool IsLeaf() { return left_child == -1; } }; @@ -207,8 +204,7 @@ template __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( const common::Span d_node_info, const common::Span d_ridx, common::Span d_out_position, OpT op) { - for (std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < d_ridx.size(); - idx += blockDim.x * gridDim.x) { + for (auto idx : dh::GridStrideRange(0, d_ridx.size())) { auto position = GetPositionFromSegments(idx, d_node_info.data()); RowIndexT ridx = d_ridx[idx]; bst_node_t new_position = op(ridx, position); @@ -221,6 +217,7 @@ __global__ __launch_bounds__(kBlockSize) void FinalisePositionKernel( class RowPartitioner { public: using RowIndexT = bst_uint; + static constexpr bst_node_t kIgnoredTreePosition = -1; private: int device_idx_; @@ -267,6 +264,20 @@ class RowPartitioner { */ std::vector GetRowsHost(bst_node_t nidx); + /** + * \brief Updates the tree position for set of training instances being split + * into left and right child nodes. Accepts a user-defined lambda specifying + * which branch each training instance should go down. 
+ * + * \tparam UpdatePositionOpT + * \tparam OpDataT + * \param nidx The index of the nodes being split. + * \param left_nidx The left child indices. + * \param right_nidx The right child indices. + * \param op_data User-defined data provided as the second argument to op + * \param op Device lambda with the row index as the first argument and op_data as the + * second. Returns true if this training instance goes on the left partition. + */ template void UpdatePositionBatch(const std::vector& nidx, const std::vector& left_nidx, @@ -291,8 +302,8 @@ class RowPartitioner { cudaMemcpyDefault, stream_)); // Temporary arrays - auto h_counts = pinned_.GetSpan(nidx.size(), PartitionCountsT{}); - dh::TemporaryArray d_counts(nidx.size(), PartitionCountsT{}); + auto h_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( @@ -300,13 +311,14 @@ class RowPartitioner { dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), sizeof(decltype(d_counts)::value_type) * d_counts.size(), cudaMemcpyDefault, stream_)); - + // TODO(Rory): this synchronisation hurts performance a lot + // Future optimisation should find a way to skip this dh::safe_cuda(cudaStreamSynchronize(stream_)); // Update segments for (int i = 0; i < nidx.size(); i++) { auto segment = ridx_segments_.at(nidx[i]).segment; - auto left_count = h_counts[i].first; + auto left_count = h_counts[i]; CHECK_LE(left_count, segment.Size()); CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), @@ -339,29 +351,7 @@ class RowPartitioner { sizeof(NodePositionInfo) * ridx_segments_.size(), cudaMemcpyDefault, stream_)); - auto d_node_info = d_node_info_storage.data().get(); - - auto current_position = [=] __device__(std::size_t idx) { - int position = 0; - NodePositionInfo node = d_node_info[position]; - while (!node.IsLeaf()) { - NodePositionInfo left = d_node_info[node.left_child]; - NodePositionInfo right = d_node_info[node.right_child]; - if (idx >= left.segment.begin && idx < left.segment.end) { - position = node.left_child; - node = left; - } else if (idx >= right.segment.begin && idx < right.segment.end) { - position = node.right_child; - node = right; - } else { - KERNEL_CHECK(false); - } - } - return position; - }; - constexpr int kBlockSize = 512; - const int kItemsThread = 8; const int grid_size = xgboost::common::DivRoundUp(ridx_.size(), kBlockSize * kItemsThread); common::Span d_ridx(ridx_.data().get(), ridx_.size()); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e461ae4f362e..e41c1d31514a 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -378,7 +378,7 @@ struct GPUHistMakerDevice { nidx.at(i) = e.nid; left_nidx.at(i) = split_node.LeftChild(); right_nidx.at(i) = split_node.RightChild(); - split_data.at(i) = NodeSplitData{ split_node, split_type, e.split.split_cats }; + split_data.at(i) = NodeSplitData{split_node, split_type, e.split.split_cats}; } auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); @@ -401,7 +401,6 @@ struct GPUHistMakerDevice { } return go_left; }); - } // After tree update is finished, update the position of all training @@ -459,7 +458,7 @@ struct GPUHistMakerDevice { auto new_position_op = [=] __device__(size_t row_id, int position) { // What happens if user prune the tree? 
if (!d_matrix.IsInRange(row_id)) { - return -1; + return RowPartitioner::kIgnoredTreePosition; } auto node = d_nodes[position]; @@ -483,7 +482,7 @@ struct GPUHistMakerDevice { position = node.RightChild(); } } - + node = d_nodes[position]; } @@ -502,17 +501,15 @@ struct GPUHistMakerDevice { }); } - bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); auto d_update_predictions = dh::ToSpan(update_predictions); - if (d_update_predictions.empty()) return false; CHECK_EQ(out_preds_d.Size(), d_update_predictions.size()); dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { out_preds_d(idx) += d_update_predictions[idx]; }); - return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -844,9 +841,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - auto result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return result; + return true; } TrainParam param_; // NOLINT diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 0061fdb121d6..8ad85779cc77 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -55,7 +55,7 @@ TEST(RowPartitioner, Batch) { TestUpdatePositionBatch(); } void TestSortPositionBatch(const std::vector& ridx_in, const std::vector& segments) { thrust::device_vector ridx = ridx_in; thrust::device_vector ridx_tmp(ridx_in.size()); - thrust::device_vector counts(segments.size()); + thrust::device_vector counts(segments.size()); auto op = [=] __device__(auto ridx, int data) { return ridx % 2 == 0; }; std::vector op_data(segments.size()); @@ -77,12 +77,12 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector Date: Fri, 17 Jun 2022 02:59:54 -0700 Subject: [PATCH 55/64] Reintroduce prediction caching for external memory. 
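
With external memory the per-row leaf-value buffer has to be filled from the ELLPACK pages rather than a single cached page, and the row partitioner is reallocated whenever its size no longer matches the number of training rows. The core caching idea, sketched standalone below, is that FinalisePosition records each row's leaf value and UpdatePredictionCache later adds that buffer onto the cached predictions instead of re-running prediction. This is an illustrative host-side sketch only; the data is made up and it is not part of the applied diff.

    #include <cstdio>
    #include <vector>

    // Illustrative host-side sketch of the prediction-cache idea (made-up data,
    // not the actual device code): stash each row's leaf value while finalising
    // positions, then bump the cached predictions from that buffer.
    int main() {
      std::vector<float> leaf_value = {0.5f, -0.25f, 0.1f};  // leaves of the new tree
      std::vector<int> final_position = {2, 0, 1, 2};        // leaf reached by each row
      std::vector<float> out_preds = {1.f, 1.f, 1.f, 1.f};   // cached predictions so far

      // Role of FinalisePosition: record the leaf value per training row.
      std::vector<float> update_predictions(final_position.size());
      for (std::size_t i = 0; i < final_position.size(); ++i) {
        update_predictions[i] = leaf_value[final_position[i]];
      }

      // Role of UpdatePredictionCache: add the buffer onto the cache instead of
      // re-running prediction over the whole DMatrix.
      for (std::size_t i = 0; i < out_preds.size(); ++i) {
        out_preds[i] += update_predictions[i];
      }
      for (float p : out_preds) {
        std::printf("%g ", p);  // prints: 1.1 1.5 0.75 1.1
      }
      std::printf("\n");
      return 0;
    }

The change below does the same per-row bookkeeping on the device, iterating GetBatches<EllpackPage> when a single page does not cover all rows.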
--- src/tree/updater_gpu_hist.cu | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index e41c1d31514a..a81129c64120 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -413,11 +413,6 @@ struct GPUHistMakerDevice { LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; } - // External memory will not use prediction cache - if (!p_fmat->SingleColBlock()) { - return; - } - dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), d_nodes.size() * sizeof(RegTree::Node), @@ -436,9 +431,25 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, - p_out_position); + if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { + row_partitioner.reset(); // Release the device memory first before reallocating + row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); + } + if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { + // see comment in the `FinalisePositionInPage`. + LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + } + if (page->n_rows == p_fmat->Info().num_row_) { + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); + } else { + for (auto const& batch : p_fmat->GetBatches(batch_param)) { + FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); + } + } } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -446,7 +457,6 @@ struct GPUHistMakerDevice { common::Span d_feature_types, common::Span categories, common::Span categories_segments, - ObjInfo task, HostDeviceVector* p_out_position) { auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto d_gpair = this->gpair; From ff05df532b82c0ddf08e338efa43d7aeb89feea5 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Sun, 19 Jun 2022 06:09:05 -0700 Subject: [PATCH 56/64] Avoid initialising temp memory --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 21 ++++++++++++------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 3 ++- 3 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 015d817f3640..87d3ad62e003 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows),d_counts(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index d27b4aa65551..a9cca72f4ce1 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ 
b/src/tree/gpu_hist/row_partitioner.cuh @@ -140,7 +140,7 @@ struct WriteResultsFunctor { template void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, - cudaStream_t stream) { + dh::device_vector* tmp, cudaStream_t stream) { WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; auto discard_write_iterator = @@ -157,10 +157,13 @@ void SortPositionBatch(common::Span ridx, common::Span rid return IndexFlagTuple{bst_uint(item_idx), op_res, batch_idx, op_res}; }); size_t temp_bytes = 0; - cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, - IndexFlagOp(), total_rows, stream); - dh::TemporaryArray temp(temp_bytes); - cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator, + if (tmp->empty()) { + cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator, discard_write_iterator, + IndexFlagOp(), total_rows, stream); + tmp->resize(temp_bytes); + } + temp_bytes = tmp->size(); + cub::DeviceScan::InclusiveScan(tmp->data().get(), temp_bytes, input_iterator, discard_write_iterator, IndexFlagOp(), total_rows, stream); constexpr int kBlockSize = 256; @@ -239,6 +242,8 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; + dh::TemporaryArray d_counts; + dh::device_vector tmp; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; cudaStream_t stream_; @@ -303,13 +308,13 @@ class RowPartitioner { // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); - dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, stream_); + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp, + stream_); dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * d_counts.size(), + sizeof(decltype(d_counts)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 8ad85779cc77..d35178c643c3 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -70,8 +70,9 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), 0, cudaMemcpyDefault, nullptr)); + dh::device_vector tmp; SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, nullptr); + dh::ToSpan(counts), total_rows, op, &tmp,nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 0280b8c21284344af83dd1207f911c6c7f1f0a98 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 23 Jun 2022 05:48:24 -0700 Subject: [PATCH 57/64] Lint --- src/tree/gpu_hist/row_partitioner.cu | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 87d3ad62e003..46cb67003962 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,7 +14,10 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), 
ridx_(num_rows), ridx_tmp_(num_rows),d_counts(kMaxUpdatePositionBatchSize) { + : device_idx_(device_idx), + ridx_(num_rows), + ridx_tmp_(num_rows), + d_counts(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); From 9c642dcaf8826ba3d8bc0f548fa3f6312d043960 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 24 Jun 2022 03:17:31 -0700 Subject: [PATCH 58/64] Review comments. --- src/tree/gpu_hist/row_partitioner.cu | 2 +- src/tree/gpu_hist/row_partitioner.cuh | 12 ++++++------ src/tree/updater_gpu_hist.cu | 2 ++ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 46cb67003962..540c07a6fe64 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -17,7 +17,7 @@ RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows), - d_counts(kMaxUpdatePositionBatchSize) { + d_counts_(kMaxUpdatePositionBatchSize) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index a9cca72f4ce1..3a42f9245a63 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -242,8 +242,8 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray d_counts; - dh::device_vector tmp; + dh::TemporaryArray d_counts_; + dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; cudaStream_t stream_; @@ -301,7 +301,7 @@ class RowPartitioner { total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= - sizeof(constant_memory)); + sizeof(constant_memory),"Not enough constant memory allocated.") ; dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), h_batch_info.size() * sizeof(PerNodeData), 0, cudaMemcpyDefault, stream_)); @@ -311,10 +311,10 @@ class RowPartitioner { // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp, + dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), total_rows, op, &tmp_, stream_); - dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), - sizeof(decltype(d_counts)::value_type) * h_counts.size(), + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), + sizeof(decltype(d_counts_)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index a81129c64120..aa65c16bae1d 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -507,6 +507,8 @@ struct GPUHistMakerDevice { bst_node_t position = d_out_position[idx]; d_update_predictions[idx] = d_nodes[position].LeafValue(); bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the 
gradient vector. d_out_position[idx] = is_row_sampled ? ~position : position; }); } From b4f2128fc59fa320cc62646e90dd5b58103b5a07 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 27 Jun 2022 05:20:23 -0700 Subject: [PATCH 59/64] Remove external memory prediction caching. --- src/tree/updater_gpu_hist.cu | 42 +++++++++++++----------------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index aa65c16bae1d..7074359131df 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -408,9 +408,11 @@ struct GPUHistMakerDevice { // prediction cache void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, HostDeviceVector* p_out_position) { - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; + // Prediction cache will not be used with external memory + if (!p_fmat->SingleColBlock()) { + p_out_position->Resize(0); + update_predictions.clear(); + return; } dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); @@ -431,25 +433,9 @@ struct GPUHistMakerDevice { dh::CopyToD(categories_segments, &d_categories_segments); } - if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { - row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); - } - if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { - // see comment in the `FinalisePositionInPage`. - LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; - } - if (page->n_rows == p_fmat->Info().num_row_) { - FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), - p_out_position); - } else { - for (auto const& batch : p_fmat->GetBatches(batch_param)) { - FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), - dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), - p_out_position); - } - } + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), + p_out_position); } void FinalisePositionInPage(EllpackPageImpl const *page, @@ -507,13 +493,14 @@ struct GPUHistMakerDevice { bst_node_t position = d_out_position[idx]; d_update_predictions[idx] = d_nodes[position].LeafValue(); bool is_row_sampled = d_gpair[idx].GetHess() - .0f == 0.f; - // FIXME(jiamingy): Doesn't work when sampling is used with external memory as - // the sampler compacts the gradient vector. d_out_position[idx] = is_row_sampled ? 
~position : position; }); } - void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + bool UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + if (update_predictions.empty()) { + return false; + } CHECK(p_tree); dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); @@ -522,6 +509,7 @@ struct GPUHistMakerDevice { dh::LaunchN(out_preds_d.Size(), [=] XGBOOST_DEVICE(size_t idx) mutable { out_preds_d(idx) += d_update_predictions[idx]; }); + return true; } // num histograms is the number of contiguous histograms in memory to reduce over @@ -853,9 +841,9 @@ class GPUHistMaker : public TreeUpdater { return false; } monitor_.Start("UpdatePredictionCache"); - maker->UpdatePredictionCache(p_out_preds, p_last_tree_); + bool result = maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); - return true; + return result; } TrainParam param_; // NOLINT From 776ef9fb807dfc568fef0bdec561d7753886464b Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:25:56 -0700 Subject: [PATCH 60/64] Remove constant memory in favour of __ldg(). --- src/common/device_helpers.cuh | 20 +++++++ src/tree/gpu_hist/row_partitioner.cuh | 57 ++++++------------- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 11 ++-- 3 files changed, 42 insertions(+), 46 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 123dc14e57be..33989a230464 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1939,4 +1939,24 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; + +// Force nvcc to load data as constant +template +class LDGIterator { + typedef typename cub::UnitWord::DeviceWord DeviceWordT; + static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); + + const T* ptr; + + public: + LDGIterator(const T* ptr) : ptr(ptr) {} + __device__ T operator[](std::size_t idx) const { + DeviceWordT tmp[kNumWords]; +#pragma unroll + for (int i = 0; i < kNumWords; i++) { + tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + } + return *reinterpret_cast(tmp); + } +}; } // namespace dh diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index 3a42f9245a63..e9fb7e86add7 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -36,8 +36,6 @@ struct PerNodeData { OpDataT data; }; -__constant__ char constant_memory[kMaxUpdatePositionBatchSize * 256]; - template __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t global_thread_idx, int* batch_idx, std::size_t* item_idx) { @@ -52,36 +50,14 @@ __device__ __forceinline__ void AssignBatch(BatchIterT batch_info, std::size_t g } } -template -struct SharedStorage { - PerNodeData data[kMaxUpdatePositionBatchSize]; - // Collectively load from global memory into shared memory - template - __device__ const PerNodeData* BlockLoad(const PerNodeData* d_batch_info) { - for (int i = threadIdx.x; i < kMaxUpdatePositionBatchSize; i += kBlockSize) { - data[i] = d_batch_info[i]; - } - __syncthreads(); - return data; - } -}; - template __global__ __launch_bounds__(kBlockSize) void SortPositionCopyKernel( - common::Span d_ridx, const common::Span ridx_tmp, - std::size_t total_rows) { - // Load this into shared memory - // the compiler puts it into registers otherwise - // then we get spilling to local memory - const 
PerNodeData* batch_info = - reinterpret_cast*>(constant_memory); - __shared__ cub::Uninitialized> shared; - auto s_batch_info = shared.Alias().BlockLoad(batch_info); - + dh::LDGIterator> batch_info, common::Span d_ridx, + const common::Span ridx_tmp, std::size_t total_rows) { for (auto idx : dh::GridStrideRange(0, total_rows)) { int batch_idx; std::size_t item_idx; - AssignBatch(s_batch_info, idx, &batch_idx, &item_idx); + AssignBatch(batch_info, idx, &batch_idx, &item_idx); d_ridx[item_idx] = ridx_tmp[item_idx]; } } @@ -109,14 +85,13 @@ struct IndexFlagOp { template struct WriteResultsFunctor { + dh::LDGIterator> batch_info; const bst_uint* ridx_in; bst_uint* ridx_out; bst_uint* counts; __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) { std::size_t scatter_address; - const PerNodeData* batch_info = - reinterpret_cast*>(constant_memory); const Segment& segment = batch_info[x.batch_idx].segment; if (x.flag) { bst_uint num_previous_flagged = x.flag_scan - 1; // -1 because inclusive scan @@ -138,18 +113,19 @@ struct WriteResultsFunctor { }; template -void SortPositionBatch(common::Span ridx, common::Span ridx_tmp, +void SortPositionBatch(common::Span> d_batch_info, + common::Span ridx, common::Span ridx_tmp, common::Span d_counts, std::size_t total_rows, OpT op, dh::device_vector* tmp, cudaStream_t stream) { - WriteResultsFunctor write_results{ridx.data(), ridx_tmp.data(), d_counts.data()}; + dh::LDGIterator> batch_info_itr(d_batch_info.data()); + WriteResultsFunctor write_results{batch_info_itr, ridx.data(), ridx_tmp.data(), + d_counts.data()}; auto discard_write_iterator = thrust::make_transform_output_iterator(dh::TypedDiscard(), write_results); auto counting = thrust::make_counting_iterator(0llu); auto input_iterator = dh::MakeTransformIterator(counting, [=] __device__(size_t idx) { - const PerNodeData* batch_info_itr = - reinterpret_cast*>(constant_memory); int batch_idx; std::size_t item_idx; AssignBatch(batch_info_itr, idx, &batch_idx, &item_idx); @@ -173,7 +149,7 @@ void SortPositionBatch(common::Span ridx, common::Span rid const int grid_size = xgboost::common::DivRoundUp(total_rows, kBlockSize * kItemsThread); SortPositionCopyKernel - <<>>(ridx, ridx_tmp, total_rows); + <<>>(batch_info_itr, ridx, ridx_tmp, total_rows); } struct NodePositionInfo { @@ -294,25 +270,24 @@ class RowPartitioner { CHECK_EQ(nidx.size(), op_data.size()); auto h_batch_info = pinned2_.GetSpan>(nidx.size()); + dh::TemporaryArray> d_batch_info(nidx.size()); std::size_t total_rows = 0; for (int i = 0; i < nidx.size(); i++) { h_batch_info[i] = {ridx_segments_.at(nidx.at(i)).segment, op_data.at(i)}; total_rows += ridx_segments_.at(nidx.at(i)).segment.Size(); } - static_assert(sizeof(PerNodeData) * kMaxUpdatePositionBatchSize <= - sizeof(constant_memory),"Not enough constant memory allocated.") ; - dh::safe_cuda(cudaMemcpyToSymbolAsync(constant_memory, h_batch_info.data(), - h_batch_info.size() * sizeof(PerNodeData), 0, - cudaMemcpyDefault, stream_)); + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), + cudaMemcpyDefault, stream_)); // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), total_rows, op, &tmp_, - stream_); + dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), + total_rows, op, &tmp_, stream_); 
dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), sizeof(decltype(d_counts_)::value_type) * h_counts.size(), cudaMemcpyDefault, stream_)); diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index d35178c643c3..520cc3cd0b81 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -67,12 +67,13 @@ void TestSortPositionBatch(const std::vector& ridx_in, const std::vector), 0, - cudaMemcpyDefault, nullptr)); + dh::safe_cuda(cudaMemcpyAsync(d_batch_info.data().get(), h_batch_info.data(), + h_batch_info.size() * sizeof(PerNodeData), cudaMemcpyDefault, + nullptr)); dh::device_vector tmp; - SortPositionBatch(dh::ToSpan(ridx), dh::ToSpan(ridx_tmp), - dh::ToSpan(counts), total_rows, op, &tmp,nullptr); + SortPositionBatch(dh::ToSpan(d_batch_info), dh::ToSpan(ridx), + dh::ToSpan(ridx_tmp), dh::ToSpan(counts), + total_rows, op, &tmp, nullptr); auto op_without_data = [=] __device__(auto ridx) { return ridx % 2 == 0; }; for (int i = 0; i < segments.size(); i++) { From 33fea3dcbfc560c1643d641098d325e1788a7172 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:43:46 -0700 Subject: [PATCH 61/64] Clang tidy --- src/common/device_helpers.cuh | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 33989a230464..a0d94886da60 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1943,20 +1943,20 @@ class CUDAStream { // Force nvcc to load data as constant template class LDGIterator { - typedef typename cub::UnitWord::DeviceWord DeviceWordT; + using DeviceWordT = typename cub::UnitWord::DeviceWord; static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); - const T* ptr; + const T *ptr_; public: - LDGIterator(const T* ptr) : ptr(ptr) {} + LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; #pragma unroll for (int i = 0; i < kNumWords; i++) { - tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + tmp[i] = __ldg(reinterpret_cast(ptr_ + idx) + i); } - return *reinterpret_cast(tmp); + return *reinterpret_cast(tmp); } }; } // namespace dh From 9de06928b38712e979de8252f38feed77388cb47 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 28 Jun 2022 05:58:59 -0700 Subject: [PATCH 62/64] Clang tidy --- src/common/device_helpers.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index a0d94886da60..738cf8de2106 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1949,7 +1949,7 @@ class LDGIterator { const T *ptr_; public: - LDGIterator(const T *ptr) : ptr_(ptr) {} + explicit LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; #pragma unroll From 3cd5e41b8e7776f06cc803cbbdd85c7c8abf3b4c Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 29 Jun 2022 06:08:32 -0700 Subject: [PATCH 63/64] Review comments. 
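
Review feedback adds a static_assert inside LDGIterator guarding its word-wise reassembly, and a hard error when leaf updates are requested together with external memory. The __ldg pattern that the assertion protects is sketched standalone below; the Segment struct and kernel here are illustrative stand-ins, not the tree updater's types, and the sketch is not part of the applied diff.

    #include <cstdio>
    #include <cuda_runtime.h>

    // Stand-in aggregate; two 32-bit fields, so it splits into exactly two words.
    struct Segment {
      unsigned begin;
      unsigned end;
    };

    // Load a Segment through the read-only data cache one word at a time, then
    // reassemble it; this is the same pattern dh::LDGIterator uses.
    __device__ Segment LoadLdg(const Segment* ptr, int idx) {
      constexpr int kNumWords = sizeof(Segment) / sizeof(unsigned);
      unsigned tmp[kNumWords];
      static_assert(sizeof(tmp) == sizeof(Segment), "Expect sizes to be equal.");
    #pragma unroll
      for (int i = 0; i < kNumWords; ++i) {
        tmp[i] = __ldg(reinterpret_cast<const unsigned*>(ptr + idx) + i);
      }
      return *reinterpret_cast<Segment*>(tmp);
    }

    __global__ void SizeKernel(const Segment* segments, unsigned* sizes, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        Segment s = LoadLdg(segments, i);
        sizes[i] = s.end - s.begin;
      }
    }

    int main() {
      Segment h_seg[2] = {{0u, 4u}, {4u, 10u}};
      unsigned h_sizes[2] = {0u, 0u};
      Segment* d_seg;
      unsigned* d_sizes;
      cudaMalloc(&d_seg, sizeof(h_seg));
      cudaMalloc(&d_sizes, sizeof(h_sizes));
      cudaMemcpy(d_seg, h_seg, sizeof(h_seg), cudaMemcpyHostToDevice);
      SizeKernel<<<1, 32>>>(d_seg, d_sizes, 2);
      cudaMemcpy(h_sizes, d_sizes, sizeof(h_sizes), cudaMemcpyDeviceToHost);
      std::printf("sizes: %u %u\n", h_sizes[0], h_sizes[1]);  // expect: 4 6
      cudaFree(d_seg);
      cudaFree(d_sizes);
      return 0;
    }

Reading PerNodeData through such an iterator keeps the batch metadata in the read-only cache without reserving __constant__ memory, which the earlier patch in this series removed.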
--- src/common/device_helpers.cuh | 1 + src/tree/updater_gpu_hist.cu | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 738cf8de2106..ccec859a286c 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1952,6 +1952,7 @@ class LDGIterator { explicit LDGIterator(const T *ptr) : ptr_(ptr) {} __device__ T operator[](std::size_t idx) const { DeviceWordT tmp[kNumWords]; + static_assert(sizeof(tmp) == sizeof(T), "Expect sizes to be equal."); #pragma unroll for (int i = 0; i < kNumWords; i++) { tmp[i] = __ldg(reinterpret_cast(ptr_ + idx) + i); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 7074359131df..5eaaeecbadf6 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -410,6 +410,9 @@ struct GPUHistMakerDevice { HostDeviceVector* p_out_position) { // Prediction cache will not be used with external memory if (!p_fmat->SingleColBlock()) { + if (task.UpdateTreeLeaf()) { + LOG(FATAL) << "Current objective function can not be used with external memory."; + } p_out_position->Resize(0); update_predictions.clear(); return; From 9eddfce0077de0db552d70833fe971896f842e02 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 30 Jun 2022 04:54:12 -0700 Subject: [PATCH 64/64] Initialise memory in case zero training rows. --- src/tree/gpu_hist/row_partitioner.cu | 5 +---- src/tree/gpu_hist/row_partitioner.cuh | 8 +++----- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 540c07a6fe64..015d817f3640 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -14,10 +14,7 @@ namespace xgboost { namespace tree { RowPartitioner::RowPartitioner(int device_idx, size_t num_rows) - : device_idx_(device_idx), - ridx_(num_rows), - ridx_tmp_(num_rows), - d_counts_(kMaxUpdatePositionBatchSize) { + : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { dh::safe_cuda(cudaSetDevice(device_idx_)); ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index e9fb7e86add7..4ba0bd27fe2f 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -218,7 +218,6 @@ class RowPartitioner { dh::TemporaryArray ridx_; // Staging area for sorting ridx dh::TemporaryArray ridx_tmp_; - dh::TemporaryArray d_counts_; dh::device_vector tmp_; dh::PinnedMemory pinned_; dh::PinnedMemory pinned2_; @@ -283,13 +282,13 @@ class RowPartitioner { // Temporary arrays auto h_counts = pinned_.GetSpan(nidx.size(), 0); + dh::TemporaryArray d_counts(nidx.size(), 0); // Partition the rows according to the operator SortPositionBatch( - dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts_), + dh::ToSpan(d_batch_info), dh::ToSpan(ridx_), dh::ToSpan(ridx_tmp_), dh::ToSpan(d_counts), total_rows, op, &tmp_, stream_); - dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts_.data().get(), - sizeof(decltype(d_counts_)::value_type) * h_counts.size(), + dh::safe_cuda(cudaMemcpyAsync(h_counts.data(), d_counts.data().get(), h_counts.size_bytes(), cudaMemcpyDefault, stream_)); // TODO(Rory): this synchronisation hurts performance a lot // Future optimisation should find a way to skip this @@ -300,7 +299,6 @@ class 
RowPartitioner { auto segment = ridx_segments_.at(nidx[i]).segment; auto left_count = h_counts[i]; CHECK_LE(left_count, segment.Size()); - CHECK_GE(left_count, 0); ridx_segments_.resize(std::max(static_cast(ridx_segments_.size()), std::max(left_nidx[i], right_nidx[i]) + 1)); ridx_segments_[nidx[i]] = NodePositionInfo{segment, left_nidx[i], right_nidx[i]};