From 1c9b7586e6d61e41209f2f85ce1297665a8a7275 Mon Sep 17 00:00:00 2001
From: fis
Date: Wed, 23 Feb 2022 12:23:08 +0800
Subject: [PATCH 1/3] Support categorical data for hist.

---
 doc/parameter.rst                           |   7 +-
 doc/tutorials/categorical.rst               |  83 +++----
 include/xgboost/tree_model.h                |  10 +
 python-package/xgboost/core.py              |   7 +-
 python-package/xgboost/sklearn.py           |  17 +-
 src/common/partition_builder.h              |  82 ++++---
 src/common/threading_utils.h                |   3 -
 src/tree/hist/evaluate_splits.h             |  14 +-
 src/tree/updater_quantile_hist.cc           | 209 +++--------------
 src/tree/updater_quantile_hist.h            | 163 ++++++++++---
 tests/cpp/tree/hist/test_evaluate_splits.cc |  18 +-
 tests/cpp/tree/test_approx.cc               |  16 +-
 tests/cpp/tree/test_partitioner.h           |  21 ++
 tests/cpp/tree/test_quantile_hist.cc        | 240 +++++---------------
 tests/python/test_updaters.py               |   1 +
 15 files changed, 393 insertions(+), 498 deletions(-)
 create mode 100644 tests/cpp/tree/test_partitioner.h

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 227263e7da61..992bd9f5af8b 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -244,9 +244,6 @@ Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method

   - Use single precision to build histograms instead of double precision.

-Additional parameters for ``approx`` and ``gpu_hist`` tree method
-=================================================================
-
 * ``max_cat_to_onehot``

   .. versionadded:: 1.6

   - A threshold for deciding whether XGBoost should use one-hot encoding based split for
     categorical data. When number of categories is lesser than the threshold then one-hot
     encoding is chosen, otherwise the categories will be partitioned into children nodes.
-  Only relevant for regression and binary classification. Also, `approx` or `gpu_hist`
-  tree method is required.
+  Only relevant for regression and binary classification. Also, the ``exact`` tree method
+  is not supported.

 Additional parameters for Dart Booster (``booster=dart``)
 =========================================================

diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index 65081be57030..7a185a113116 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -4,16 +4,16 @@ Categorical Data

 .. note::

-  As of XGBoost 1.6, the feature is highly experimental and has limited features
+  As of XGBoost 1.6, the support is experimental and has limited features.

 Starting from version 1.5, XGBoost has experimental support for categorical data available
-for public testing. At the moment, the support is implemented as one-hot encoding based
-categorical tree splits. For numerical data, the split condition is defined as
-:math:`value < threshold`, while for categorical data the split is defined as :math:`value
-== category` and ``category`` is a discrete value. More advanced categorical split
-strategy is planned for future releases and this tutorial details how to inform XGBoost
-about the data type. Also, the current support for training is limited to ``gpu_hist``
-tree method.
+for public testing. For numerical data, the split condition is defined as :math:`value <
+threshold`, while for categorical data the split is defined depending on whether
+partitioning or one-hot encoding is used. For partition-based splits, the splits are
+specified as :math:`value \in categories`, where ``categories`` is the set of categories
+in one feature. If one-hot encoding is used instead, then the split is defined as
+:math:`value == category`. More advanced categorical split strategies are planned for
+future releases, and this tutorial details how to inform XGBoost about the data type.

 ************************************
 Training with scikit-learn Interface
 ************************************

@@ -35,13 +35,13 @@ parameter ``enable_categorical``:

 .. code:: python

-  # Only gpu_hist is supported for categorical data as mentioned previously
+  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
   clf = xgb.XGBClassifier(
       tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
   )
   # X is the dataframe we created in previous snippet
   clf.fit(X, y)
-  # Must use JSON for serialization, otherwise the information is lost
+  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
   clf.save_model("categorical-model.json")

@@ -60,11 +60,37 @@ can plot the model and calculate the global feature importance:

 The ``scikit-learn`` interface from dask is similar to single node version. The basic
-idea is create dataframe with category feature type, and tell XGBoost to use ``gpu_hist``
-with parameter ``enable_categorical``. See :ref:`sphx_glr_python_examples_categorical.py`
-for a worked example of using categorical data with ``scikit-learn`` interface. A
-comparison between using one-hot encoded data and XGBoost's categorical data support can
-be found :ref:`sphx_glr_python_examples_cat_in_the_dat.py`.
+idea is to create a dataframe with the ``category`` feature type, and tell XGBoost to use
+it by setting the ``enable_categorical`` parameter. See
+:ref:`sphx_glr_python_examples_categorical.py` for a worked example of using categorical
+data with the ``scikit-learn`` interface. A comparison between using one-hot encoded data
+and XGBoost's categorical data support can be found in
+:ref:`sphx_glr_python_examples_cat_in_the_dat.py`.
+
+
+********************
+Optimal Partitioning
+********************
+
+.. versionadded:: 1.6
+
+Optimal partitioning is a technique for partitioning the categorical predictors for each
+node split; the proof of optimality for numerical objectives like ``RMSE`` was first
+introduced by `[1] <#references>`__. The algorithm has been used in decision trees for
+handling regression and binary classification tasks `[2] <#references>`__; later, LightGBM
+`[3] <#references>`__ brought it to the context of gradient boosting trees, and it is now
+also adopted in XGBoost as an optional feature for handling categorical splits. More
+specifically, the proof by Fisher `[1] <#references>`__ states that, when trying to
+partition a set of discrete values into groups based on the distances between a measure of
+these values, one only needs to look at sorted partitions instead of enumerating all
+possible permutations. In the context of decision trees, the discrete values are
+categories, and the measure is the output leaf value. Intuitively, we want to group the
+categories that output similar leaf values. During split finding, we first sort the
+gradient histogram to prepare the contiguous partitions, then enumerate the splits
+according to these sorted values. One of the related parameters for XGBoost is
+``max_cat_to_onehot``, which controls whether one-hot encoding or partitioning should be
+used for each feature; see :doc:`/parameter` for details. When the objective is not
+regression or binary classification, XGBoost will fall back to using one-hot encoding
+instead.
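+
+As a minimal sketch of how these options fit together (assuming ``X`` is a dataframe with
+``category`` dtype columns and ``y`` is the target; the threshold of ``16`` is an
+arbitrary value used only for illustration):
+
+.. code:: python
+
+  import xgboost as xgb
+
+  # `X` must use the pandas/cudf ``category`` dtype for its categorical columns.
+  Xy = xgb.DMatrix(X, y, enable_categorical=True)
+  # Features with fewer than 16 categories are split with one-hot encoding;
+  # the remaining features use optimal partitioning.
+  booster = xgb.train({"tree_method": "hist", "max_cat_to_onehot": 16}, Xy)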
********************** @@ -82,7 +108,7 @@ categorical data, we need to pass the similar parameter to :class:`DMatrix # X is a dataframe we created in previous snippet Xy = xgb.DMatrix(X, y, enable_categorical=True) - booster = xgb.train({"tree_method": "gpu_hist"}, Xy) + booster = xgb.train({"tree_method": "hist", "max_cat_to_onehot": 5}, Xy) # Must use JSON for serialization, otherwise the information is lost booster.save_model("categorical-model.json") @@ -109,30 +135,7 @@ types by using the ``feature_types`` parameter in :class:`DMatrix ` can also be used as categorical data. - -******************** -Optimal Partitioning -******************** - -.. versionadded:: 1.6 - -Optimal partitioning is a technique for partitioning the categorical predictors for each -node split, the proof of optimality for numerical objectives like ``RMSE`` was first -introduced by `[1] <#references>`__. The algorithm is used in decision trees for handling -regression and binary classification tasks `[2] <#references>`__, later LightGBM `[3] -<#references>`__ brought it to the context of gradient boosting trees and now is also -adopted in XGBoost as an optional feature for handling categorical splits. More -specifically, the proof by Fisher `[1] <#references>`__ states that, when trying to -partition a set of discrete values into groups based on the distances between a measure of -these values, one only needs to look at sorted partitions instead of enumerating all -possible permutations. In the context of decision trees, the discrete values are -categories, and the measure is the output leaf value. Intuitively, we want to group the -categories that output similar leaf values. During split finding, we first sort the -gradient histogram to prepare the contiguous partitions then enumerate the splits -according to these sorted values. One of the related parameters for XGBoost is -``max_cat_to_one_hot``, which controls whether one-hot encoding or partitioning should be -used for each feature, see :doc:`/parameter` for details. +:class:`dask.Array ` can also be used for categorical data. ************* Miscellaneous diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 5c23f278a83b..b2d2ad3383de 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -604,6 +604,16 @@ class RegTree : public Model { */ std::vector const &GetSplitTypes() const { return split_types_; } common::Span GetSplitCategories() const { return split_categories_; } + /*! + * \brief Get the bit storage for categories + */ + common::Span NodeCats(bst_node_t nidx) const { + auto node_ptr = GetCategoriesMatrix().node_ptr; + auto categories = GetCategoriesMatrix().categories; + auto segment = node_ptr[nidx]; + auto node_cats = categories.subspan(segment.beg, segment.size); + return node_cats; + } auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; } // The fields of split_categories_segments_[i] are set such that diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 009dea9904a0..f22371ab867a 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -582,10 +582,11 @@ def __init__( .. versionadded:: 1.3.0 + .. note:: This parameter is experimental + Experimental support of specializing for categorical features. Do not set - to True unless you are interested in development. Currently it's only - available for `gpu_hist` and `approx` tree methods. Also, JSON/UBJSON - serialization format is required. 
(XGBoost 1.6 for approx) + to True unless you are interested in development. Also, JSON/UBJSON + serialization format is required. """ if group is not None and qid is not None: diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index b7bfe8b3219e..6efbf7cd36f7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -206,10 +206,11 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]: .. versionadded:: 1.5.0 - Experimental support for categorical data. Do not set to true unless you are - interested in development. Only valid when `gpu_hist` or `approx` is used along - with dataframe as input. Also, JSON/UBJSON serialization format is - required. (XGBoost 1.6 for approx) + .. note:: This parameter is experimental + + Experimental support for categorical data. When enabled, cudf/pandas.DataFrame + should be used to specify categorical data type. Also, JSON/UBJSON + serialization format is required. max_cat_to_onehot : Optional[int] @@ -220,9 +221,8 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]: A threshold for deciding whether XGBoost should use one-hot encoding based split for categorical data. When number of categories is lesser than the threshold then one-hot encoding is chosen, otherwise the categories will be partitioned - into children nodes. Only relevant for regression and binary - classification. Also, ``approx`` or ``gpu_hist`` tree method is required. See - :doc:`Categorical Data ` for details. + into children nodes. Only relevant for regression and binary classification. + See :doc:`Categorical Data ` for details. eval_metric : Optional[Union[str, List[str], Callable]] @@ -846,7 +846,8 @@ def _duplicated(parameter: str) -> None: callbacks = self.callbacks if self.callbacks is not None else callbacks tree_method = params.get("tree_method", None) - if self.enable_categorical and tree_method not in ("gpu_hist", "approx"): + cat_support = {"gpu_hist", "approx", "hist"} + if self.enable_categorical and tree_method not in cat_support: raise ValueError( "Experimental support for categorical data is not implemented for" " current tree method yet." diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 5235ea3b9404..0fdde231f731 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -1,5 +1,5 @@ /*! 
- * Copyright 2021 by Contributors + * Copyright 2021-2022 by Contributors * \file row_set.h * \brief Quick Utility to compute subset of rows * \author Philip Cho, Tianqi Chen @@ -8,12 +8,15 @@ #define XGBOOST_COMMON_PARTITION_BUILDER_H_ #include + #include -#include -#include #include +#include +#include + +#include "categorical.h" +#include "column_matrix.h" #include "xgboost/tree_model.h" -#include "../common/column_matrix.h" namespace xgboost { namespace common { @@ -46,18 +49,20 @@ class PartitionBuilder { // on comparison of indexes values (idx_span) and split point (split_cond) // Handle dense columns // Analog of std::stable_partition, but in no-inplace manner - template + template inline std::pair PartitionKernel(const ColumnType& column, - common::Span rid_span, const int32_t split_cond, - common::Span left_part, common::Span right_part) { + common::Span row_indices, + common::Span left_part, + common::Span right_part, + size_t base_rowid, Predicate&& pred) { size_t* p_left_part = left_part.data(); size_t* p_right_part = right_part.data(); size_t nleft_elems = 0; size_t nright_elems = 0; - auto state = column.GetInitialState(rid_span.front()); + auto state = column.GetInitialState(row_indices.front() - base_rowid); - for (auto rid : rid_span) { - const int32_t bin_id = column.GetBinIdx(rid, &state); + for (auto rid : row_indices) { + const int32_t bin_id = column.GetBinIdx(rid - base_rowid, &state); if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { p_left_part[nleft_elems++] = rid; @@ -65,7 +70,7 @@ class PartitionBuilder { p_right_part[nright_elems++] = rid; } } else { - if (bin_id <= split_cond) { + if (pred(rid, bin_id)) { p_left_part[nleft_elems++] = rid; } else { p_right_part[nright_elems++] = rid; @@ -97,39 +102,64 @@ class PartitionBuilder { template void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range, - const int32_t split_cond, - const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { + const int32_t split_cond, GHistIndexMatrix const& gmat, + const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, - range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, - range.begin(), range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); const bst_uint fid = tree[nid].SplitIndex(); const bool default_left = tree[nid].DefaultLeft(); const auto column_ptr = column_matrix.GetColumn(fid); - std::pair child_nodes_sizes; + bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; + auto node_cats = tree.NodeCats(nid); + + auto const& index = gmat.index; + auto const& cut_values = gmat.cut.Values(); + auto cut_ptrs = gmat.cut.Ptrs(); + + auto pred = [&](auto ridx, auto bin_id) { + bool go_left; + if (is_cat) { + auto begin = gmat.RowIdx(ridx); + auto end = gmat.RowIdx(ridx + 1); + auto f_begin = cut_ptrs[fid]; + auto f_end = cut_ptrs[fid + 1]; + // bypassing the column matrix as we need the cut value instead of bin idx for categorical + // features. 
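+        // (BinarySearchBin searches this row's span of `index` for a bin id that falls in
+        // the feature's range [f_begin, f_end) and returns it, so the matched category can
+        // be looked up through `cut_values`; -1 means the row has no value for this
+        // feature, i.e. it is missing.)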
+ auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end); + if (gidx == -1) { + go_left = default_left; + } else { + go_left = Decision(node_cats, cut_values[gidx], default_left); + } + } else { + go_left = bin_id <= split_cond; + } + return go_left; + }; + std::pair child_nodes_sizes; if (column_ptr->GetType() == xgboost::common::kDenseColumn) { const common::DenseColumn& column = static_cast& >(*(column_ptr.get())); if (default_left) { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } else { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } } else { CHECK_EQ(any_missing, true); const common::SparseColumn& column = static_cast& >(*(column_ptr.get())); if (default_left) { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } else { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } } diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 4691fce7cd95..75e9ba5b3b18 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -275,9 +275,6 @@ class MemStackAllocator { T& operator[](size_t i) { return ptr_[i]; } T const& operator[](size_t i) const { return ptr_[i]; } - // FIXME(jiamingy): Remove this once we merge partitioner cleanup for hist. - auto Get() { return ptr_; } - private: T* ptr_ = nullptr; size_t required_size_; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index ed3f14e605da..169d1cdc541a 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -288,10 +288,10 @@ template class HistEvaluator { auto base_weight = evaluator.CalcWeight(candidate.nid, param_, GradStats{parent_sum}); - auto left_weight = evaluator.CalcWeight( - candidate.nid, param_, GradStats{candidate.split.left_sum}); - auto right_weight = evaluator.CalcWeight( - candidate.nid, param_, GradStats{candidate.split.right_sum}); + auto left_weight = + evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.left_sum}); + auto right_weight = + evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.right_sum}); if (candidate.split.is_cat) { std::vector split_cats; @@ -308,11 +308,11 @@ template class HistEvaluator { split_cats = candidate.split.cat_bits; common::CatBitField cat_bits{split_cats}; } - tree.ExpandCategorical( candidate.nid, candidate.split.SplitIndex(), split_cats, candidate.split.DefaultLeft(), - base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), - candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess()); + base_weight, left_weight * param_.learning_rate, right_weight * param_.learning_rate, + candidate.split.loss_chg, parent_sum.GetHess(), candidate.split.left_sum.GetHess(), + candidate.split.right_sum.GetHess()); } else { tree.ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value, candidate.split.DefaultLeft(), base_weight, diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 616d1c5718ce..60b501b7c4a4 100644 --- 
a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -124,11 +124,12 @@ void QuantileHistMaker::Builder::InitRoot( nodes_for_subtraction_trick_.clear(); nodes_for_explicit_hist_build_.push_back(node); + auto const& row_set_collection = partitioner_.front().Partitions(); size_t page_id = 0; for (auto const& gidx : p_fmat->GetBatches(HistBatch(param_))) { this->histogram_builder_->BuildHist( - page_id, gidx, p_tree, row_set_collection_, + page_id, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build_, nodes_for_subtraction_trick_, gpair_h); ++page_id; } @@ -149,7 +150,7 @@ void QuantileHistMaker::Builder::InitRoot( grad_stat.Add(et.GetGrad(), et.GetHess()); } } else { - const common::RowSetCollection::Elem e = row_set_collection_[nid]; + const common::RowSetCollection::Elem e = row_set_collection[nid]; for (const size_t *it = e.begin; it < e.end; ++it) { grad_stat.Add(gpair_h[*it].GetGrad(), gpair_h[*it].GetHess()); } @@ -204,6 +205,7 @@ void QuantileHistMaker::Builder::SplitSiblings( const std::vector &nodes_for_apply_split, std::vector *nodes_to_evaluate, RegTree *p_tree) { builder_monitor_.Start("SplitSiblings"); + auto const& row_set_collection = this->partitioner_.front().Partitions(); for (auto const& entry : nodes_for_apply_split) { int nid = entry.nid; @@ -213,7 +215,7 @@ void QuantileHistMaker::Builder::SplitSiblings( const CPUExpandEntry right_node = CPUExpandEntry(cright, p_tree->GetDepth(cright), 0.0); nodes_to_evaluate->push_back(left_node); nodes_to_evaluate->push_back(right_node); - if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) { + if (row_set_collection[cleft].Size() < row_set_collection[cright].Size()) { nodes_for_explicit_hist_build_.push_back(left_node); nodes_for_subtraction_trick_.push_back(right_node); } else { @@ -253,16 +255,18 @@ void QuantileHistMaker::Builder::ExpandTree( AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split); if (nodes_for_apply_split.size() != 0) { - ApplySplit(nodes_for_apply_split, gmat, column_matrix, p_tree); + HistRowPartitioner &partitioner = this->partitioner_.front(); + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); if (param_.max_depth == 0 || depth < param_.max_depth) { size_t i = 0; - for (auto const& gidx : p_fmat->GetBatches(HistBatch(param_))) { - this->histogram_builder_->BuildHist( - i, gidx, p_tree, row_set_collection_, - nodes_for_explicit_hist_build_, nodes_for_subtraction_trick_, - gpair_h); + for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { + this->histogram_builder_->BuildHist(i, gidx, p_tree, partitioner_.front().Partitions(), + nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_, gpair_h); ++i; } } else { @@ -293,7 +297,7 @@ void QuantileHistMaker::Builder::ExpandTree( template void QuantileHistMaker::Builder::Update( const GHistIndexMatrix &gmat, - const ColumnMatrix &column_matrix, + const common::ColumnMatrix &column_matrix, HostDeviceVector *gpair, DMatrix *p_fmat, RegTree *p_tree) { builder_monitor_.Start("Update"); @@ -333,14 +337,14 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( CHECK_GT(out_preds.Size(), 0U); - size_t n_nodes = row_set_collection_.end() - row_set_collection_.begin(); - - common::BlockedSpace2d space(n_nodes, [&](size_t node) { - return row_set_collection_[node].Size(); - }, 1024); + CHECK_EQ(partitioner_.size(), 1); + auto const &row_set_collection = 
this->partitioner_.front().Partitions(); + size_t n_nodes = row_set_collection.end() - row_set_collection.begin(); + common::BlockedSpace2d space( + n_nodes, [&](size_t node) { return partitioner_.front()[node].Size(); }, 1024); CHECK_EQ(out_preds.DeviceIdx(), GenericParameter::kCpuId); common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node, common::Range1d r) { - const RowSetCollection::Elem rowset = row_set_collection_[node]; + const common::RowSetCollection::Elem rowset = row_set_collection[node]; if (rowset.begin != nullptr && rowset.end != nullptr) { int nid = rowset.node_id; bst_float leaf_value; @@ -354,7 +358,7 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( } leaf_value = (*p_last_tree_)[nid].LeafValue(); - for (const size_t* it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { out_preds(*it) += leaf_value; } } @@ -364,10 +368,9 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( return true; } -template +template void QuantileHistMaker::Builder::InitSampling(const DMatrix& fmat, - std::vector* gpair, - std::vector* row_indices) { + std::vector* gpair) { const auto& info = fmat.Info(); auto& rnd = common::GlobalRandom(); std::vector& gpair_ref = *gpair; @@ -410,101 +413,31 @@ template void QuantileHistMaker::Builder::InitData( const GHistIndexMatrix &gmat, const DMatrix &fmat, const RegTree &tree, std::vector *gpair) { - CHECK((param_.max_depth > 0 || param_.max_leaves > 0)) - << "max_depth or max_leaves cannot be both 0 (unlimited); " - << "at least one should be a positive quantity."; - if (param_.grow_policy == TrainParam::kDepthWise) { - CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) " - << "when grow_policy is depthwise."; - } builder_monitor_.Start("InitData"); const auto& info = fmat.Info(); { - // initialize the row set - row_set_collection_.Clear(); // initialize histogram collection uint32_t nbins = gmat.cut.Ptrs().back(); // initialize histogram builder dmlc::OMPException exc; - exc.Rethrow(); this->histogram_builder_->Reset(nbins, BatchParam{GenericParameter::kCpuId, param_.max_bin}, this->ctx_->Threads(), 1, rabit::IsDistributed()); - std::vector& row_indices = *row_set_collection_.Data(); - row_indices.resize(info.num_row_); - size_t* p_row_indices = row_indices.data(); - // mark subsample and build list of member rows - if (param_.subsample < 1.0f) { CHECK_EQ(param_.sampling_method, TrainParam::kUniform) << "Only uniform sampling is supported, " << "gradient-based sampling is only support by GPU Hist."; builder_monitor_.Start("InitSampling"); - InitSampling(fmat, gpair, &row_indices); + InitSampling(fmat, gpair); builder_monitor_.Stop("InitSampling"); - CHECK_EQ(row_indices.size(), info.num_row_); // We should check that the partitioning was done correctly // and each row of the dataset fell into exactly one of the categories } - auto n_threads = this->ctx_->Threads(); - common::MemStackAllocator buff(n_threads); - bool* p_buff = buff.Get(); - std::fill(p_buff, p_buff + this->ctx_->Threads(), false); - - const size_t block_size = info.num_row_ / n_threads + !!(info.num_row_ % n_threads); - -#pragma omp parallel num_threads(n_threads) - { - exc.Run([&]() { - const size_t tid = omp_get_thread_num(); - const size_t ibegin = tid * block_size; - const size_t iend = std::min(static_cast(ibegin + block_size), - static_cast(info.num_row_)); - - for (size_t i = ibegin; i < iend; ++i) { - if ((*gpair)[i].GetHess() < 0.0f) { 
- p_buff[tid] = true; - break; - } - } - }); - } - exc.Rethrow(); - - bool has_neg_hess = false; - for (int32_t tid = 0; tid < n_threads; ++tid) { - if (p_buff[tid]) { - has_neg_hess = true; - } - } - - if (has_neg_hess) { - size_t j = 0; - for (size_t i = 0; i < info.num_row_; ++i) { - if ((*gpair)[i].GetHess() >= 0.0f) { - p_row_indices[j++] = i; - } - } - row_indices.resize(j); - } else { - #pragma omp parallel num_threads(n_threads) - { - exc.Run([&]() { - const size_t tid = omp_get_thread_num(); - const size_t ibegin = tid * block_size; - const size_t iend = std::min(static_cast(ibegin + block_size), - static_cast(info.num_row_)); - for (size_t i = ibegin; i < iend; ++i) { - p_row_indices[i] = i; - } - }); - } - exc.Rethrow(); - } } - row_set_collection_.Init(); + partitioner_.clear(); + partitioner_.emplace_back(info.num_row_, 0, this->ctx_->Threads()); { /* determine layout of data */ @@ -558,12 +491,9 @@ void QuantileHistMaker::Builder::InitData( builder_monitor_.Stop("InitData"); } -template -void QuantileHistMaker::Builder::FindSplitConditions( - const std::vector& nodes, - const RegTree& tree, - const GHistIndexMatrix& gmat, - std::vector* split_conditions) { +void HistRowPartitioner::FindSplitConditions(const std::vector &nodes, + const RegTree &tree, const GHistIndexMatrix &gmat, + std::vector *split_conditions) { const size_t n_nodes = nodes.size(); split_conditions->resize(n_nodes); @@ -576,8 +506,7 @@ void QuantileHistMaker::Builder::FindSplitConditions( int32_t split_cond = -1; // convert floating-point split_pt into corresponding bin_id // split_cond = -1 indicates that split_pt is less than all known cut points - CHECK_LT(upper_bound, - static_cast(std::numeric_limits::max())); + CHECK_LT(upper_bound, static_cast(std::numeric_limits::max())); for (uint32_t bound = lower_bound; bound < upper_bound; ++bound) { if (split_pt == gmat.cut.Values()[bound]) { split_cond = static_cast(bound); @@ -586,88 +515,20 @@ void QuantileHistMaker::Builder::FindSplitConditions( (*split_conditions)[i] = split_cond; } } -template -void QuantileHistMaker::Builder::AddSplitsToRowSet( - const std::vector& nodes, - RegTree* p_tree) { + +void HistRowPartitioner::AddSplitsToRowSet(const std::vector &nodes, + RegTree const *p_tree) { const size_t n_nodes = nodes.size(); for (unsigned int i = 0; i < n_nodes; ++i) { const int32_t nid = nodes[i].nid; const size_t n_left = partition_builder_.GetNLeftElems(i); const size_t n_right = partition_builder_.GetNRightElems(i); CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild()); - row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), - (*p_tree)[nid].RightChild(), n_left, n_right); + row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(), + n_left, n_right); } } -template -template -void QuantileHistMaker::Builder::ApplySplit(const std::vector nodes, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - RegTree* p_tree) { - builder_monitor_.Start("ApplySplit"); - // 1. 
Find split condition for each split - const size_t n_nodes = nodes.size(); - std::vector split_conditions; - FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); - // 2.1 Create a blocked space of size SUM(samples in each node) - common::BlockedSpace2d space(n_nodes, [&](size_t node_in_set) { - int32_t nid = nodes[node_in_set].nid; - return row_set_collection_[nid].Size(); - }, kPartitionBlockSize); - // 2.2 Initialize the partition builder - // allocate buffers for storage intermediate results by each thread - partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) { - const int32_t nid = nodes[node_in_set].nid; - const size_t size = row_set_collection_[nid].Size(); - const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize); - return n_tasks; - }); - // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node - // Store results in intermediate buffers from partition_builder_ - common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node_in_set, common::Range1d r) { - size_t begin = r.begin(); - const int32_t nid = nodes[node_in_set].nid; - const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin); - partition_builder_.AllocateForTask(task_id); - switch (column_matrix.GetTypeSize()) { - case common::kUint8BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - case common::kUint16BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - case common::kUint32BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - default: - CHECK(false); // no default behavior - } - }); - // 3. Compute offsets to copy blocks of row-indexes - // from partition_builder_ to row_set_collection_ - partition_builder_.CalculateRowOffsets(); - - // 4. Copy elements from partition_builder_ to row_set_collection_ back - // with updated row-indexes for each tree-node - common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node_in_set, common::Range1d r) { - const int32_t nid = nodes[node_in_set].nid; - partition_builder_.MergeToArray(node_in_set, r.begin(), - const_cast(row_set_collection_[nid].begin)); - }); - // 5. 
Add info about splits into row_set_collection_ - AddSplitsToRowSet(nodes, p_tree); - builder_monitor_.Stop("ApplySplit"); -} - template struct QuantileHistMaker::Builder; template struct QuantileHistMaker::Builder; diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 09df175cd0f8..4213d5dc8862 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -11,9 +11,9 @@ #include #include -#include +#include +#include #include -#include #include #include #include @@ -38,8 +38,6 @@ #include "../common/column_matrix.h" namespace xgboost { - - struct RandomReplace { public: // similar value as for minstd_rand @@ -82,15 +80,127 @@ struct RandomReplace { }; namespace tree { +class HistRowPartitioner { + // heuristically chosen block size of parallel partitioning + static constexpr size_t kPartitionBlockSize = 2048; + // worker class that partition a block of rows + common::PartitionBuilder partition_builder_; + // storage for row index + common::RowSetCollection row_set_collection_; + + /** + * \brief Turn split values into discrete bin indices. + */ + static void FindSplitConditions(const std::vector& nodes, const RegTree& tree, + const GHistIndexMatrix& gmat, + std::vector* split_conditions); + /** + * \brief Update the row set for new splits specifed by nodes. + */ + void AddSplitsToRowSet(const std::vector& nodes, RegTree const* p_tree); + + public: + bst_row_t base_rowid = 0; + + public: + HistRowPartitioner(size_t n_samples, size_t base_rowid, int32_t n_threads) { + row_set_collection_.Clear(); + const size_t block_size = n_samples / n_threads + !!(n_samples % n_threads); + dmlc::OMPException exc; + std::vector& row_indices = *row_set_collection_.Data(); + row_indices.resize(n_samples); + size_t* p_row_indices = row_indices.data(); + // parallel initialization o f row indices. (std::iota) +#pragma omp parallel num_threads(n_threads) + { + exc.Run([&]() { + const size_t tid = omp_get_thread_num(); + const size_t ibegin = tid * block_size; + const size_t iend = std::min(static_cast(ibegin + block_size), n_samples); + for (size_t i = ibegin; i < iend; ++i) { + p_row_indices[i] = i + base_rowid; + } + }); + } + row_set_collection_.Init(); + this->base_rowid = base_rowid; + } -using xgboost::GHistIndexMatrix; -using xgboost::common::GHistIndexRow; -using xgboost::common::HistCollection; -using xgboost::common::RowSetCollection; -using xgboost::common::GHistRow; -using xgboost::common::GHistBuilder; -using xgboost::common::ColumnMatrix; -using xgboost::common::Column; + template + void UpdatePosition(GenericParameter const* ctx, GHistIndexMatrix const& gmat, + common::ColumnMatrix const& column_matrix, + std::vector const& nodes, RegTree const* p_tree) { + // 1. 
Find split condition for each split + const size_t n_nodes = nodes.size(); + std::vector split_conditions; + FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); + // 2.1 Create a blocked space of size SUM(samples in each node) + common::BlockedSpace2d space( + n_nodes, + [&](size_t node_in_set) { + int32_t nid = nodes[node_in_set].nid; + return row_set_collection_[nid].Size(); + }, + kPartitionBlockSize); + // 2.2 Initialize the partition builder + // allocate buffers for storage intermediate results by each thread + partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) { + const int32_t nid = nodes[node_in_set].nid; + const size_t size = row_set_collection_[nid].Size(); + const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize); + return n_tasks; + }); + CHECK_EQ(base_rowid, gmat.base_rowid); + // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node + // Store results in intermediate buffers from partition_builder_ + common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { + size_t begin = r.begin(); + const int32_t nid = nodes[node_in_set].nid; + const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin); + partition_builder_.AllocateForTask(task_id); + switch (column_matrix.GetTypeSize()) { + case common::kUint8BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + case common::kUint16BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + case common::kUint32BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + default: + // no default behavior + CHECK(false) << column_matrix.GetTypeSize(); + } + }); + // 3. Compute offsets to copy blocks of row-indexes + // from partition_builder_ to row_set_collection_ + partition_builder_.CalculateRowOffsets(); + + // 4. Copy elements from partition_builder_ to row_set_collection_ back + // with updated row-indexes for each tree-node + common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { + const int32_t nid = nodes[node_in_set].nid; + partition_builder_.MergeToArray(node_in_set, r.begin(), + const_cast(row_set_collection_[nid].begin)); + }); + // 5. 
Add info about splits into row_set_collection_ + AddSplitsToRowSet(nodes, p_tree); + } + + auto const& Partitions() const { return row_set_collection_; } + size_t Size() const { + return std::distance(row_set_collection_.begin(), row_set_collection_.end()); + } + auto& operator[](bst_node_t nidx) { return row_set_collection_[nidx]; } + auto const& operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; } +}; inline BatchParam HistBatch(TrainParam const& param) { return {param.max_bin, param.sparse_threshold}; @@ -185,21 +295,7 @@ class QuantileHistMaker: public TreeUpdater { size_t GetNumberOfTrees(); - void InitSampling(const DMatrix& fmat, - std::vector* gpair, - std::vector* row_indices); - - template - void ApplySplit(std::vector nodes, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - RegTree* p_tree); - - void AddSplitsToRowSet(const std::vector& nodes, RegTree* p_tree); - - - void FindSplitConditions(const std::vector& nodes, const RegTree& tree, - const GHistIndexMatrix& gmat, std::vector* split_conditions); + void InitSampling(const DMatrix& fmat, std::vector* gpair); template void InitRoot(DMatrix* p_fmat, @@ -221,7 +317,7 @@ class QuantileHistMaker: public TreeUpdater { template void ExpandTree(const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, + const common::ColumnMatrix& column_matrix, DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h); @@ -232,9 +328,6 @@ class QuantileHistMaker: public TreeUpdater { std::shared_ptr column_sampler_{ std::make_shared()}; - std::vector unused_rows_; - // the internal row sets - RowSetCollection row_set_collection_; std::vector gpair_local_; /*! \brief feature with least # of bins. to be used for dense specialization @@ -243,12 +336,12 @@ class QuantileHistMaker: public TreeUpdater { std::unique_ptr pruner_; std::unique_ptr> evaluator_; - - static constexpr size_t kPartitionBlockSize = 2048; - common::PartitionBuilder partition_builder_; + // Right now there's only 1 partitioner in this vector, when external memory is fully + // supported we will have number of partitioners equal to number of pages. 
+ std::vector partitioner_; // back pointers to tree and data matrix - const RegTree* p_last_tree_; + const RegTree* p_last_tree_{nullptr}; DMatrix const* const p_last_fmat_; DMatrix* p_last_fmat_mutable_; diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index f3760534b4ab..e46726e7401c 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -40,7 +40,7 @@ template void TestEvaluateSplits() { std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); - auto hist_builder = GHistBuilder(gmat.cut.Ptrs().back()); + auto hist_builder = common::GHistBuilder(gmat.cut.Ptrs().back()); hist.Init(gmat.cut.Ptrs().back()); hist.AddHistRow(0); hist.AllocateAllData(); @@ -94,7 +94,7 @@ TEST(HistEvaluator, Apply) { RegTree tree; int static constexpr kNRows = 8, kNCols = 16; TrainParam param; - param.UpdateAllowUnknown(Args{{}}); + param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}}); auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix(); auto sampler = std::make_shared(); auto evaluator_ = HistEvaluator{param, dmat->Info(), 4, sampler, @@ -102,12 +102,22 @@ TEST(HistEvaluator, Apply) { CPUExpandEntry entry{0, 0, 10.0f}; entry.split.left_sum = GradStats{0.4, 0.6f}; - entry.split.right_sum = GradStats{0.5, 0.7f}; + entry.split.right_sum = GradStats{0.5, 0.5f}; evaluator_.ApplyTreeSplit(entry, &tree); ASSERT_EQ(tree.NumExtraNodes(), 2); ASSERT_EQ(tree.Stat(tree[0].LeftChild()).sum_hess, 0.6f); - ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.7f); + ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.5f); + + { + RegTree tree; + entry.split.is_cat = true; + entry.split.split_value = 1.0; + evaluator_.ApplyTreeSplit(entry, &tree); + auto l = entry.split.left_sum; + ASSERT_NEAR(tree[1].LeafValue(), -l.sum_grad / l.sum_hess * param.learning_rate, kRtEps); + ASSERT_NEAR(tree[2].LeafValue(), -param.learning_rate, kRtEps); + } } TEST_F(TestPartitionBasedSplit, CPUHist) { diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 639768b5eabb..a37c0973627e 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -1,26 +1,14 @@ /*! - * Copyright 2021 XGBoost contributors + * Copyright 2021-2022, XGBoost contributors. */ #include #include "../../../src/tree/updater_approx.h" #include "../helpers.h" +#include "test_partitioner.h" namespace xgboost { namespace tree { -namespace { -void GetSplit(RegTree *tree, float split_value, std::vector *candidates) { - tree->ExpandNode( - /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, - /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - /*left_sum=*/0.0f, - /*right_sum=*/0.0f); - candidates->front().split.split_value = split_value; - candidates->front().split.sindex = 0; - candidates->front().split.sindex |= (1U << 31); -} -} // anonymous namespace - TEST(Approx, Partitioner) { size_t n_samples = 1024, n_features = 1, base_rowid = 0; ApproxRowPartitioner partitioner{n_samples, base_rowid}; diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h new file mode 100644 index 000000000000..109749a2832f --- /dev/null +++ b/tests/cpp/tree/test_partitioner.h @@ -0,0 +1,21 @@ +/*! + * Copyright 2021-2022, XGBoost contributors. 
+ */ +#include +#include +#include "../../../src/tree/hist/expand_entry.h" + +namespace xgboost { +namespace tree { +inline void GetSplit(RegTree *tree, float split_value, std::vector *candidates) { + tree->ExpandNode( + /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, + /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + /*left_sum=*/0.0f, + /*right_sum=*/0.0f); + candidates->front().split.split_value = split_value; + candidates->front().split.sindex = 0; + candidates->front().split.sindex |= (1U << 31); +} +} // namespace tree +} // namespace xgboost diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index d043c5bb5a47..d2bebf0616d1 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,18 +1,19 @@ /*! * Copyright 2018-2022 by XGBoost Contributors */ +#include #include #include -#include #include -#include #include +#include -#include "../helpers.h" #include "../../../src/tree/param.h" -#include "../../../src/tree/updater_quantile_hist.h" #include "../../../src/tree/split_evaluator.h" +#include "../../../src/tree/updater_quantile_hist.h" +#include "../helpers.h" +#include "test_partitioner.h" #include "xgboost/data.h" namespace xgboost { @@ -94,130 +95,6 @@ class QuantileHistMock : public QuantileHistMaker { } } } - - void TestInitDataSampling(const GHistIndexMatrix& gmat, - std::vector* gpair, - DMatrix* p_fmat, - const RegTree& tree) { - // check SimpleSkip - size_t initial_seed = 777; - std::linear_congruential_engine(1) << 63 > eng_first(initial_seed); - for (size_t i = 0; i < 100; ++i) { - eng_first(); - } - uint64_t initial_seed_th = RandomReplace::SimpleSkip(100, initial_seed, 16807, RandomReplace::kMod); - std::linear_congruential_engine eng_second(initial_seed_th); - ASSERT_EQ(eng_first(), eng_second()); - - const size_t nthreads = omp_get_num_threads(); - // save state of global rng engine - auto initial_rnd = common::GlobalRandom(); - std::vector unused_rows_cpy = this->unused_rows_; - RealImpl::InitData(gmat, *p_fmat, tree, gpair); - std::vector row_indices_initial = *(this->row_set_collection_.Data()); - std::vector unused_row_indices_initial = this->unused_rows_; - ASSERT_EQ(row_indices_initial.size(), p_fmat->Info().num_row_); - auto check_each_row_occurs_in_one_of_arrays = [](const std::vector& first, - const std::vector& second, - size_t nrows) { - ASSERT_EQ(first.size(), nrows); - ASSERT_EQ(second.size(), 0); - }; - check_each_row_occurs_in_one_of_arrays(row_indices_initial, unused_row_indices_initial, - p_fmat->Info().num_row_); - - for (size_t i_nthreads = 1; i_nthreads < 4; ++i_nthreads) { - omp_set_num_threads(i_nthreads); - // return initial state of global rng engine - common::GlobalRandom() = initial_rnd; - this->unused_rows_ = unused_rows_cpy; - RealImpl::InitData(gmat, *p_fmat, tree, gpair); - std::vector& row_indices = *(this->row_set_collection_.Data()); - ASSERT_EQ(row_indices_initial.size(), row_indices.size()); - for (size_t i = 0; i < row_indices_initial.size(); ++i) { - ASSERT_EQ(row_indices_initial[i], row_indices[i]); - } - std::vector& unused_row_indices = this->unused_rows_; - ASSERT_EQ(unused_row_indices_initial.size(), unused_row_indices.size()); - for (size_t i = 0; i < unused_row_indices_initial.size(); ++i) { - ASSERT_EQ(unused_row_indices_initial[i], unused_row_indices[i]); - } - check_each_row_occurs_in_one_of_arrays(row_indices, unused_row_indices, - p_fmat->Info().num_row_); - } - omp_set_num_threads(nthreads); - } - - void 
TestApplySplit(const RegTree& tree) { - std::vector row_gpairs = - { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, - {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} }; - int32_t constexpr kMaxBins = 4; - - // try out different sparsity to get different number of missing values - for (double sparsity : {0.0, 0.1, 0.2}) { - // kNRows samples with kNCols features - auto dmat = RandomDataGenerator(kNRows, kNCols, sparsity).Seed(3).GenerateDMatrix(); - - float sparse_th = 0.0; - GHistIndexMatrix gmat{dmat.get(), kMaxBins, sparse_th, false, common::OmpGetNumThreads(0)}; - ColumnMatrix cm; - - // treat everything as dense, as this is what we intend to test here - cm.Init(gmat, sparse_th, common::OmpGetNumThreads(0)); - RealImpl::InitData(gmat, *dmat, tree, &row_gpairs); - const size_t num_row = dmat->Info().num_row_; - // split by feature 0 - const size_t bin_id_min = gmat.cut.Ptrs()[0]; - const size_t bin_id_max = gmat.cut.Ptrs()[1]; - - // attempt to split at different bins - for (size_t split = 0; split < 4; split++) { - size_t left_cnt = 0, right_cnt = 0; - - // manually compute how many samples go left or right - for (size_t rid = 0; rid < num_row; ++rid) { - for (size_t offset = gmat.row_ptr[rid]; offset < gmat.row_ptr[rid + 1]; ++offset) { - const size_t bin_id = gmat.index[offset]; - if (bin_id >= bin_id_min && bin_id < bin_id_max) { - if (bin_id <= split) { - left_cnt++; - } else { - right_cnt++; - } - } - } - } - - // if any were missing due to sparsity, we add them to the left or to the right - size_t missing = kNRows - left_cnt - right_cnt; - if (tree[0].DefaultLeft()) { - left_cnt += missing; - } else { - right_cnt += missing; - } - - // have one node with kNRows (=8 at the moment) rows, just one task - RealImpl::partition_builder_.Init(1, 1, [&](size_t node_in_set) { - return 1; - }); - const size_t task_id = RealImpl::partition_builder_.GetTaskIdx(0, 0); - RealImpl::partition_builder_.AllocateForTask(task_id); - if (cm.AnyMissing()) { - RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows), - split, cm, tree, this->row_set_collection_[0].begin); - } else { - RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows), - split, cm, tree, this->row_set_collection_[0].begin); - } - RealImpl::partition_builder_.CalculateRowOffsets(); - ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), left_cnt); - ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), right_cnt); - } - } - } }; int static constexpr kNRows = 8, kNCols = 16; @@ -262,33 +139,6 @@ class QuantileHistMock : public QuantileHistMaker { float_builder_->TestInitData(gmat, &gpair, dmat_.get(), tree); } } - - void TestInitDataSampling() { - int32_t constexpr kMaxBins = 4; - GHistIndexMatrix gmat{dmat_.get(), kMaxBins, 0.0f, false, common::OmpGetNumThreads(0)}; - - RegTree tree = RegTree(); - tree.param.UpdateAllowUnknown(cfg_); - - std::vector gpair = - { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, - {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; - if (double_builder_) { - double_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree); - } else { - float_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree); - } - } - - void TestApplySplit() { - RegTree tree = RegTree(); - tree.param.UpdateAllowUnknown(cfg_); - if (double_builder_) { - double_builder_->TestApplySplit(tree); - } else { - float_builder_->TestApplySplit(tree); - } - } }; TEST(QuantileHist, InitData) { @@ -301,30 +151,62 
@@ TEST(QuantileHist, InitData) { maker_float.TestInitData(); } -TEST(QuantileHist, InitDataSampling) { - const float subsample = 0.5; - std::vector> cfg - {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}, - {"subsample", std::to_string(subsample)}}; - QuantileHistMock maker(cfg); - maker.TestInitDataSampling(); - const bool single_precision_histogram = true; - QuantileHistMock maker_float(cfg, single_precision_histogram); - maker_float.TestInitDataSampling(); -} - -TEST(QuantileHist, ApplySplit) { - std::vector> cfg - {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}, - {"split_evaluator", "elastic_net"}, - {"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"}, - {"min_child_weight", "0"}}; - QuantileHistMock maker(cfg); - maker.TestApplySplit(); - const bool single_precision_histogram = true; - QuantileHistMock maker_float(cfg, single_precision_histogram); - maker_float.TestApplySplit(); +TEST(QuantileHist, Partitioner) { + size_t n_samples = 1024, n_features = 1, base_rowid = 0; + GenericParameter ctx; + ctx.InitAllowUnknown(Args{}); + + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + ASSERT_EQ(partitioner.base_rowid, base_rowid); + ASSERT_EQ(partitioner.Size(), 1); + ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples); + + auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); + std::vector candidates{{0, 0, 0.4}}; + + auto grad = GenerateRandomGradients(n_samples); + std::vector hess(grad.Size()); + std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(), + [](auto gpair) { return gpair.GetHess(); }); + + for (auto const& page : Xy->GetBatches({64, 0.5})) { + bst_feature_t const split_ind = 0; + common::ColumnMatrix column_indices; + column_indices.Init(page, 0.5, ctx.Threads()); + { + auto min_value = page.cut.MinValues()[split_ind]; + RegTree tree; + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + GetSplit(&tree, min_value, &candidates); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + ASSERT_EQ(partitioner.Size(), 3); + ASSERT_EQ(partitioner[1].Size(), 0); + ASSERT_EQ(partitioner[2].Size(), n_samples); + } + { + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + auto ptr = page.cut.Ptrs()[split_ind + 1]; + float split_value = page.cut.Values().at(ptr / 2); + RegTree tree; + GetSplit(&tree, split_value, &candidates); + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + + auto elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto it = elem.begin; it != elem.end; ++it) { + auto value = page.cut.Values().at(page.index[*it]); + ASSERT_LE(value, split_value); + } + auto right_nidx = tree[RegTree::kRoot].RightChild(); + elem = partitioner[right_nidx]; + for (auto it = elem.begin; it != elem.end; ++it) { + auto value = page.cut.Values().at(page.index[*it]); + ASSERT_GT(value, split_value) << *it; + } + } + } } - } // namespace tree } // namespace xgboost diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 3e80d273899f..b73736c6950f 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -245,3 +245,4 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method): @pytest.mark.skipif(**tm.no_pandas()) def test_categorical(self, rows, cols, rounds, cats): self.run_categorical_basic(rows, cols, 
rounds, cats, "approx") + self.run_categorical_basic(rows, cols, rounds, cats, "hist") From eba5252a9d5afce7f7b0ddb54fed5fdb355df2ef Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 23 Feb 2022 23:18:39 +0800 Subject: [PATCH 2/3] Fix ref. --- src/common/partition_builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 0fdde231f731..774126dd76f1 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -116,7 +116,7 @@ class PartitionBuilder { auto const& index = gmat.index; auto const& cut_values = gmat.cut.Values(); - auto cut_ptrs = gmat.cut.Ptrs(); + auto const& cut_ptrs = gmat.cut.Ptrs(); auto pred = [&](auto ridx, auto bin_id) { bool go_left; From d55fcc9473d74599761ee2581b65618809e20de9 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 24 Feb 2022 01:40:38 +0800 Subject: [PATCH 3/3] Small optimization. --- src/common/partition_builder.h | 16 ++++++++++------ src/tree/updater_quantile_hist.cc | 9 +++++++-- src/tree/updater_quantile_hist.h | 8 ++++---- tests/cpp/tree/test_quantile_hist.cc | 4 ++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 774126dd76f1..3250b9d2bf25 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -61,7 +61,11 @@ class PartitionBuilder { size_t nright_elems = 0; auto state = column.GetInitialState(row_indices.front() - base_rowid); - for (auto rid : row_indices) { + auto p_row_indices = row_indices.data(); + auto n_samples = row_indices.size(); + + for (size_t i = 0; i < n_samples; ++i) { + auto rid = p_row_indices[i]; const int32_t bin_id = column.GetBinIdx(rid - base_rowid, &state); if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { @@ -100,7 +104,7 @@ class PartitionBuilder { return {nleft_elems, nright_elems}; } - template + template void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range, const int32_t split_cond, GHistIndexMatrix const& gmat, const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { @@ -119,8 +123,7 @@ class PartitionBuilder { auto const& cut_ptrs = gmat.cut.Ptrs(); auto pred = [&](auto ridx, auto bin_id) { - bool go_left; - if (is_cat) { + if (any_cat && is_cat) { auto begin = gmat.RowIdx(ridx); auto end = gmat.RowIdx(ridx + 1); auto f_begin = cut_ptrs[fid]; @@ -128,15 +131,16 @@ class PartitionBuilder { // bypassing the column matrix as we need the cut value instead of bin idx for categorical // features. 
auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end); + bool go_left; if (gidx == -1) { go_left = default_left; } else { go_left = Decision(node_cats, cut_values[gidx], default_left); } + return go_left; } else { - go_left = bin_id <= split_cond; + return bin_id <= split_cond; } - return go_left; }; std::pair child_nodes_sizes; diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 60b501b7c4a4..cfc95039ea24 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -256,8 +256,13 @@ void QuantileHistMaker::Builder::ExpandTree( if (nodes_for_apply_split.size() != 0) { HistRowPartitioner &partitioner = this->partitioner_.front(); - partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, - nodes_for_apply_split, p_tree); + if (gmat.cut.HasCategorical()) { + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + } else { + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + } SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 4213d5dc8862..1c863028c53b 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -126,7 +126,7 @@ class HistRowPartitioner { this->base_rowid = base_rowid; } - template + template void UpdatePosition(GenericParameter const* ctx, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, std::vector const& nodes, RegTree const* p_tree) { @@ -160,17 +160,17 @@ class HistRowPartitioner { partition_builder_.AllocateForTask(task_id); switch (column_matrix.GetTypeSize()) { case common::kUint8BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; case common::kUint16BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; case common::kUint32BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index d2bebf0616d1..4a26496cb80f 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -178,7 +178,7 @@ TEST(QuantileHist, Partitioner) { RegTree tree; HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; GetSplit(&tree, min_value, &candidates); - partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); @@ -190,7 +190,7 @@ TEST(QuantileHist, Partitioner) { RegTree tree; GetSplit(&tree, split_value, &candidates); auto left_nidx = tree[RegTree::kRoot].LeftChild(); - partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); auto elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples);