From 9f68393676669f2df62d90f2f8ac92b6c453e232 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 03:36:17 +0800 Subject: [PATCH 01/19] Initial commit. --- src/tree/gpu_hist/evaluate_splits.cu | 83 +++++++++++++++++---------- src/tree/hist/evaluate_splits.h | 2 +- tests/python-gpu/test_gpu_updaters.py | 3 +- tests/python/test_updaters.py | 5 +- 4 files changed, 56 insertions(+), 37 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 908e7b01972c..3d6fa2dfa024 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -167,35 +167,34 @@ class EvaluateSplitAgent { } } } + __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, - bst_feature_t * __restrict__ sorted_idx, - std::size_t offset) { + bst_feature_t *__restrict__ sorted_idx, + std::size_t offset) { for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - auto rest = thread_active - ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) - : GradientPairPrecise(); + auto right_sum = + thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) + : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. - BlockScanT(temp_storage->scan).InclusiveSum(rest, rest, prefix_op); - GradientPairPrecise bin = parent_sum - rest - missing; + BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); + GradientPairPrecise left_sum = parent_sum - right_sum - missing; // Whether the gradient of missing values is put to the left side. bool missing_left = true; - float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx, - evaluator, missing_left) + float gain = thread_active ? LossChangeMissing(left_sum, missing, parent_sum, param, nidx, + fidx, evaluator, missing_left) : kNullGain; - // Find thread with best gain - auto best = - MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); // This reduce result is only valid in thread 0 // broadcast to the rest of the warp auto best_thread = __shfl_sync(0xffffffff, best.key, 0); // Best thread updates the split if (threadIdx.x == best_thread) { - GradientPairPrecise left = missing_left ? bin + missing : bin; + GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; GradientPairPrecise right = parent_sum - left; auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); // index of best threshold inside a feature. 
@@ -277,32 +276,54 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu return; } + // partition-based split auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); auto best_thresh = out_split.PopBestThresh(); auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); - if (out_split.dir != kLeftDir) { - // forward, missing on right - auto beg = dh::tcbegin(f_sorted_idx); - // Don't put all the categories into one side - auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); - boundary = std::max(boundary, static_cast(1ul)); - auto end = beg + boundary; - thrust::for_each(thrust::seq, beg, end, [&](auto c) { - auto cat = shared_inputs.feature_values[c - node_offset]; - assert(!out_split.split_cats.Check(cat) && "already set"); - out_split.SetCat(cat); - }); + bool forward = out_split.dir == kLeftDir; + auto cut_ptr = shared_inputs.feature_segments; + + auto n_bins = shared_inputs.FeatureBins(fidx); + bst_bin_t ibegin, iend; + bst_bin_t f_begin = cut_ptr[fidx]; + if (forward) { + ibegin = f_begin; + iend = ibegin + n_bins - 1; } else { - assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); - thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), - dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { - auto cat = shared_inputs.feature_values[c - node_offset]; - out_split.SetCat(cat); - }); + ibegin = static_cast(cut_ptr[fidx + 1]) - 1; + iend = ibegin - n_bins + 1; } + + bst_bin_t partition = forward ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + auto beg = dh::tcbegin(f_sorted_idx); + thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) { + auto cat = shared_inputs.feature_values[c - node_offset]; + out_split.SetCat(cat); + }); + + // if (out_split.dir != kLeftDir) { + // // backward, missing on right + // auto beg = dh::tcbegin(f_sorted_idx); + // // Don't put all the categories into one side + // auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); + // boundary = std::max(boundary, static_cast(1ul)); + // auto end = beg + boundary; + // thrust::for_each(thrust::seq, beg, end, [&](auto c) { + // auto cat = shared_inputs.feature_values[c - node_offset]; + // assert(!out_split.split_cats.Check(cat) && "already set"); + // out_split.SetCat(cat); + // }); + // } else { + // assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); + // thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), + // dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { + // auto cat = shared_inputs.feature_values[c - node_offset]; + // out_split.SetCat(cat); + // }); + // } } void GPUHistEvaluator::LaunchEvaluateSplits( diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 002728be365f..adb7e54decf9 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -193,7 +193,7 @@ class HistEvaluator { bst_bin_t partition = d_step == 1 ? 
(best_thresh - ibegin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, - [&](size_t c) { cat_bits.Set(c); }); + [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] } p_best->Update(best); diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index b2f8b1a27050..0b086f0e9282 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note +from hypothesis import given, strategies, assume, settings, note, reproduce_failure sys.path.append("tests/python") import testing as tm @@ -84,6 +84,7 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) + @reproduce_failure('6.47.1', b'AAAAAAA=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 93dfb45c6b45..881439292a06 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -257,10 +257,7 @@ def run(max_cat_to_onehot: int): np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1]) # Test with OHE split - run(self.USE_ONEHOT) - - if tree_method == "gpu_hist": # fixme: Test with GPU. - return + # run(self.USE_ONEHOT) # Test with partition-based split run(self.USE_PART) From c33d9fc270488bf7196ee5d612cd956925673380 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 17:48:41 +0800 Subject: [PATCH 02/19] Small cleanup to CPU. --- src/tree/hist/evaluate_splits.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index adb7e54decf9..644eadc30f81 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -144,7 +144,10 @@ class HistEvaluator { auto const &cut_ptr = cut.Ptrs(); auto const &parent = snode_[nidx]; - bst_bin_t n_bins_feature{static_cast(cut_ptr[fidx + 1] - cut_ptr[fidx])}; + + bst_bin_t f_begin = cut_ptr[fidx]; + bst_bin_t f_end = cut_ptr[fidx + 1]; + bst_bin_t n_bins_feature{f_end - f_begin}; auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature); // statistics on both sides of split @@ -153,19 +156,18 @@ class HistEvaluator { // best split so far SplitEntry best; - auto f_hist = hist.subspan(cut_ptr[fidx], n_bins_feature); - bst_bin_t ibegin, iend; - bst_bin_t f_begin = cut_ptr[fidx]; + auto f_hist = hist.subspan(f_begin, n_bins_feature); + bst_bin_t it_begin, it_end; if (d_step > 0) { - ibegin = f_begin; - iend = ibegin + n_bins - 1; + it_begin = f_begin; + it_end = it_begin + n_bins - 1; } else { - ibegin = static_cast(cut_ptr[fidx + 1]) - 1; - iend = ibegin - n_bins + 1; + it_begin = f_end - 1; + it_end = it_begin - n_bins + 1; } bst_bin_t best_thresh{-1}; - for (bst_bin_t i = ibegin; i != iend; i += d_step) { + for (bst_bin_t i = it_begin; i != it_end; i += d_step) { auto j = i - f_begin; // index local to current feature if (d_step == 1) { right_sum.Add(f_hist[sorted_idx[j]].GetGrad(), f_hist[sorted_idx[j]].GetHess()); @@ -190,7 +192,7 @@ class HistEvaluator { auto n = common::CatBitField::ComputeStorageSize(n_bins_feature + 1); best.cat_bits = decltype(best.cat_bits)(n, 0); common::CatBitField cat_bits{best.cat_bits}; - bst_bin_t partition = 
d_step == 1 ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] From 8397bb0d52974985461239a05fd1f1b3c529384e Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 19:14:46 +0800 Subject: [PATCH 03/19] Specialize on cat update. --- src/tree/gpu_hist/evaluate_splits.cu | 73 +++++++++++---------------- src/tree/gpu_hist/evaluate_splits.cuh | 3 +- src/tree/updater_gpu_common.cuh | 39 ++++++++------ src/tree/updater_gpu_hist.cu | 12 +++-- 4 files changed, 62 insertions(+), 65 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 3d6fa2dfa024..077cd795d68e 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -43,9 +43,9 @@ class EvaluateSplitAgent { public: using ArgMaxT = cub::KeyValuePair; using BlockScanT = cub::BlockScan; - using MaxReduceT = - cub::WarpReduce; + using MaxReduceT = cub::WarpReduce; using SumReduceT = cub::WarpReduce; + struct TempStorage { typename BlockScanT::TempStorage scan; typename MaxReduceT::TempStorage max_reduce; @@ -159,11 +159,10 @@ class EvaluateSplitAgent { if (threadIdx.x == best_thread) { int32_t split_gidx = (scan_begin + threadIdx.x); float fvalue = feature_values[split_gidx]; - GradientPairPrecise left = - missing_left ? bin + missing : bin; + GradientPairPrecise left = missing_left ? bin + missing : bin; GradientPairPrecise right = parent_sum - left; - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, - true, param); + best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir, + static_cast(fvalue), fidx, left, right, param); } } } @@ -171,7 +170,7 @@ class EvaluateSplitAgent { __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, bst_feature_t *__restrict__ sorted_idx, std::size_t offset) { - for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { + for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; auto right_sum = @@ -187,6 +186,18 @@ class EvaluateSplitAgent { fidx, evaluator, missing_left) : kNullGain; + auto const thresh = threadIdx.x + scan_begin; + auto const &forward = missing_left; + auto const &backward = !forward; + // prevent splitting on missing value alone, skip the last bin if it's forward scan + // and skip the first bin if it's backward + auto const first_bin = gidx_begin; + auto const last_bin = gidx_end - 1; + bool const should_skip = (forward && thresh == last_bin) || (backward && thresh == first_bin); + if (should_skip) { + gain = kNullGain; + } + // Find thread with best gain auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); // This reduce result is only valid in thread 0 @@ -196,10 +207,10 @@ class EvaluateSplitAgent { if (threadIdx.x == best_thread) { GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; GradientPairPrecise right = parent_sum - left; - auto best_thresh = - threadIdx.x + (scan_begin - gidx_begin); // index of best threshold inside a feature. - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, - right, true, param); + // index of best threshold inside a feature. 
+ auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); + auto dft_dir = missing_left ? kLeftDir : kRightDir; + best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); } } } @@ -272,7 +283,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu // Simple case for one hot split if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) { - out_split.split_cats.Set(common::AsCat(out_split.fvalue)); + out_split.split_cats.Set(common::AsCat(out_split.thresh)); return; } @@ -280,50 +291,26 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); - auto best_thresh = out_split.PopBestThresh(); + auto best_thresh = out_split.thresh; + if (best_thresh == -1) { + return; + } + auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); bool forward = out_split.dir == kLeftDir; auto cut_ptr = shared_inputs.feature_segments; auto n_bins = shared_inputs.FeatureBins(fidx); - bst_bin_t ibegin, iend; bst_bin_t f_begin = cut_ptr[fidx]; - if (forward) { - ibegin = f_begin; - iend = ibegin + n_bins - 1; - } else { - ibegin = static_cast(cut_ptr[fidx + 1]) - 1; - iend = ibegin - n_bins + 1; - } - bst_bin_t partition = forward ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); + assert(partition > 0 && "Invalid partition."); thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) { auto cat = shared_inputs.feature_values[c - node_offset]; out_split.SetCat(cat); }); - - // if (out_split.dir != kLeftDir) { - // // backward, missing on right - // auto beg = dh::tcbegin(f_sorted_idx); - // // Don't put all the categories into one side - // auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); - // boundary = std::max(boundary, static_cast(1ul)); - // auto end = beg + boundary; - // thrust::for_each(thrust::seq, beg, end, [&](auto c) { - // auto cat = shared_inputs.feature_values[c - node_offset]; - // assert(!out_split.split_cats.Check(cat) && "already set"); - // out_split.SetCat(cat); - // }); - // } else { - // assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); - // thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), - // dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { - // auto cat = shared_inputs.feature_values[c - node_offset]; - // out_split.SetCat(cat); - // }); - // } } void GPUHistEvaluator::LaunchEvaluateSplits( diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 3e3e51b8e7c2..6b50de0ce3de 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -141,7 +141,8 @@ class GPUHistEvaluator { */ common::Span GetHostNodeCats(bst_node_t nidx) const { copy_stream_.View().Sync(); - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); + auto cats_out = common::Span{h_split_cats_}.subspan( + nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } /** diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 
c7c81e964848..b9a31dd32477 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -57,6 +57,9 @@ struct DeviceSplitCandidate { DefaultDirection dir {kLeftDir}; int findex {-1}; float fvalue {0}; + // categorical split, either it's the split category for OHE or the threshold for partition-based + // split. + bst_cat_t thresh{-1}; common::CatBitField split_cats; bool is_cat { false }; @@ -75,22 +78,6 @@ struct DeviceSplitCandidate { *this = other; } } - /** - * \brief The largest encoded category in the split bitset - */ - bst_cat_t MaxCat() const { - // Reuse the fvalue for categorical values. - return static_cast(fvalue); - } - /** - * \brief Return the best threshold for cat split, reset the value after return. - */ - XGBOOST_DEVICE size_t PopBestThresh() { - // fvalue is also being used for storing the threshold for categorical split - auto best_thresh = static_cast(this->fvalue); - this->fvalue = 0; - return best_thresh; - } template XGBOOST_DEVICE void SetCat(T c) { @@ -116,6 +103,26 @@ struct DeviceSplitCandidate { findex = findex_in; } } + + /** + * \brief Update for partition-based splits. + */ + XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in, + bst_feature_t findex_in, GradientPairPrecise left_sum_in, + GradientPairPrecise right_sum_in, GPUTrainingParam const& param) { + if (loss_chg_in > loss_chg && left_sum_in.GetHess() >= param.min_child_weight && + right_sum_in.GetHess() >= param.min_child_weight) { + loss_chg = loss_chg_in; + dir = dir_in; + fvalue = std::numeric_limits::quiet_NaN(); + thresh = thresh_in; + is_cat = true; + left_sum = left_sum_in; + right_sum = right_sum_in; + findex = findex_in; + } + } + XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; } friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3f3137c58adc..bafe800cd98b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -601,13 +601,14 @@ struct GPUHistMakerDevice { auto is_cat = candidate.split.is_cat; if (is_cat) { - CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) - << "Categorical feature value too large."; - std::vector split_cats; + // should be set to nan in evaluation split. 
+ CHECK(common::CheckNAN(candidate.split.fvalue)); + std::vector split_cats; + CHECK_GT(candidate.split.split_cats.Bits().size(), 0); auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex); + split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature + 1), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); @@ -616,6 +617,7 @@ struct GPUHistMakerDevice { base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess()); } else { + CHECK(!common::CheckNAN(candidate.split.fvalue)); tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), From dcfdbcafe8a0f4204b033b6fa2523a602afeb16e Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 23:28:35 +0800 Subject: [PATCH 04/19] Dense, tie breaking. --- src/tree/gpu_hist/evaluate_splits.cu | 16 ++- src/tree/hist/evaluate_splits.h | 2 + .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 122 +++++++++++++++++- tests/cpp/tree/test_evaluate_splits.h | 16 ++- tests/python-gpu/test_gpu_updaters.py | 2 +- tests/python/test_updaters.py | 15 ++- 6 files changed, 158 insertions(+), 15 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 077cd795d68e..aaab931ff3dc 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -27,7 +27,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum); float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan); - missing_left_out = missing_left_gain > missing_right_gain; + missing_left_out = missing_left_gain >= missing_right_gain; return missing_left_out?missing_left_gain:missing_right_gain; } @@ -176,6 +176,7 @@ class EvaluateSplitAgent { auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) : GradientPairPrecise(); + // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); GradientPairPrecise left_sum = parent_sum - right_sum - missing; @@ -185,6 +186,7 @@ class EvaluateSplitAgent { float gain = thread_active ? LossChangeMissing(left_sum, missing, parent_sum, param, nidx, fidx, evaluator, missing_left) : kNullGain; + // printf("l: %f, r: %f, gain: %f\n", left_sum.GetHess(), right_sum.GetHess(), gain); auto const thresh = threadIdx.x + scan_begin; auto const &forward = missing_left; @@ -210,6 +212,7 @@ class EvaluateSplitAgent { // index of best threshold inside a feature. auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); auto dft_dir = missing_left ? 
kLeftDir : kRightDir; + // printf("best_thresh: %d\n", static_cast(best_thresh)); best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); } } @@ -291,18 +294,17 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); - auto best_thresh = out_split.thresh; + auto const best_thresh = out_split.thresh; if (best_thresh == -1) { + printf("Invalid split\n"); return; } - auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); + // bool forward = true; bool forward = out_split.dir == kLeftDir; - auto cut_ptr = shared_inputs.feature_segments; - - auto n_bins = shared_inputs.FeatureBins(fidx); - bst_bin_t f_begin = cut_ptr[fidx]; + // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), + // static_cast(forward)); bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 644eadc30f81..b624152eecfd 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -196,6 +196,8 @@ class HistEvaluator { CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] + // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), + // static_cast(d_step == 1)); } p_best->Update(best); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 28b69122deae..2837036df32d 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -19,6 +19,125 @@ auto ZeroParam() { } } // anonymous namespace +TEST(GpuHist, PartitionBasic) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types; + auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); + cuts.SetCategorical(true, max_cat); + d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; 
+ auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-7.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; + // EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // All -1.0, gain from splitting should be 0.0 + // GradientPairPrecise parent_sum(-3.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // With 3.0/3.0 missing values + // All categories go left + // missing values go right + // { + // GradientPairPrecise parent_sum(0.0, 6.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("11100000000000000000000000000000")); + // EXPECT_EQ(result.dir, kRightDir); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-5.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-5.0, 3.0); + // thrust::device_vector feature_histogram = + // 
std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + // EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); @@ -263,8 +382,7 @@ TEST(GpuHist, EvaluateSplits) { TEST_F(TestPartitionBasedSplit, GpuHist) { dh::device_vector ft{std::vector{FeatureType::kCategorical}}; - GPUHistEvaluator evaluator{param_, - static_cast(info_.num_col_), 0}; + GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), 0}; cuts_.cut_ptrs_.SetDevice(0); cuts_.cut_values_.SetDevice(0); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 7925e40cdb26..7db556659e2b 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -43,7 +43,7 @@ class TestPartitionBasedSplit : public ::testing::Test { auto &h_vals = cuts_.cut_values_.HostVector(); h_vals.resize(n_bins_); std::iota(h_vals.begin(), h_vals.end(), 0.0); - + cuts_.min_vals_.Resize(1); hist_.Init(cuts_.TotalBins()); @@ -97,5 +97,19 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; + +class TestPartitionBasedEvaluation : public testing::Test { + common::HistogramCuts cuts; + + public: + void SetUp() override { + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + + auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); + cuts.SetCategorical(true, max_cat); + } +}; } // namespace tree } // namespace xgboost diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 0b086f0e9282..afb3dbe5cf85 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -84,7 +84,7 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AAAAAAA=') + @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 881439292a06..4b864e447f8e 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -5,7 +5,7 @@ import pytest import xgboost as xgb import numpy as np -from hypothesis import given, strategies, settings, note +from hypothesis import given, strategies, settings, note, reproduce_failure exact_parameter_strategy = strategies.fixed_dictionaries({ 'nthread': strategies.integers(1, 4), @@ -234,7 +234,7 @@ def run_categorical_missing( ) -> None: parameters: Dict[str, Any] = {"tree_method": tree_method} cat, label = tm.make_categorical( - n_samples=256, n_features=4, n_categories=8, onehot=False, sparsity=0.5 + n_samples=rows, n_features=cols, 
n_categories=cats, onehot=False, sparsity=0.5 ) Xy = xgb.DMatrix(cat, label, enable_categorical=True) @@ -246,10 +246,16 @@ def run(max_cat_to_onehot: int): booster = xgb.train( parameters, Xy, - num_boost_round=16, + num_boost_round=8, evals=[(Xy, "Train")], evals_result=evals_result ) + import json + booster.save_model(f"{tree_method}.json") + with open(f"{tree_method}.json", "r") as fd: + model = json.load(fd) + with open(f"{tree_method}.json", "w") as fd: + json.dump(model, fd, indent=2) assert tm.non_increasing(evals_result["Train"]["rmse"]) y_predt = booster.predict(Xy) @@ -394,6 +400,7 @@ def test_categorical_ames_housing( ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) + @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): - self.run_categorical_missing(rows, cols, cats, "approx") + # self.run_categorical_missing(rows, cols, cats, "approx") self.run_categorical_missing(rows, cols, cats, "hist") From 0e4489a5b0e8367dae8bb706fe27fbc66cd5c597 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:44:54 +0800 Subject: [PATCH 05/19] Scan. --- src/tree/gpu_hist/evaluate_splits.cu | 75 +++++++++++++++------------- src/tree/hist/evaluate_splits.h | 2 - 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index aaab931ff3dc..58e5e41d140e 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -27,7 +27,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum); float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan); - missing_left_out = missing_left_gain >= missing_right_gain; + missing_left_out = missing_left_gain > missing_right_gain; return missing_left_out?missing_left_gain:missing_right_gain; } @@ -168,10 +168,10 @@ class EvaluateSplitAgent { } __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, - bst_feature_t *__restrict__ sorted_idx, + common::Span sorted_idx, std::size_t offset) { - for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { - bool thread_active = (scan_begin + threadIdx.x) < gidx_end; + for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end - 1; scan_begin += kBlockSize) { + bool thread_active = (scan_begin + threadIdx.x) < (gidx_end - 1); auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) @@ -179,26 +179,39 @@ class EvaluateSplitAgent { // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); - GradientPairPrecise left_sum = parent_sum - right_sum - missing; + GradientPairPrecise left_sum = parent_sum - right_sum; + auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) + : kNullGain; - // Whether the gradient of missing values is put to the left side. - bool missing_left = true; - float gain = thread_active ? 
LossChangeMissing(left_sum, missing, parent_sum, param, nidx, - fidx, evaluator, missing_left) - : kNullGain; - // printf("l: %f, r: %f, gain: %f\n", left_sum.GetHess(), right_sum.GetHess(), gain); - - auto const thresh = threadIdx.x + scan_begin; - auto const &forward = missing_left; - auto const &backward = !forward; - // prevent splitting on missing value alone, skip the last bin if it's forward scan - // and skip the first bin if it's backward - auto const first_bin = gidx_begin; - auto const last_bin = gidx_end - 1; - bool const should_skip = (forward && thresh == last_bin) || (backward && thresh == first_bin); - if (should_skip) { - gain = kNullGain; + // Find thread with best gain + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + // This reduce result is only valid in thread 0 + // broadcast to the rest of the warp + auto best_thread = __shfl_sync(0xffffffff, best.key, 0); + // Best thread updates the split + if (threadIdx.x == best_thread) { + // index of best threshold inside a feature. + auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); + best_split->UpdateCat(gain, kLeftDir, best_thresh, fidx, left_sum, right_sum, param); } + } + cub::CTA_SYNC(); + + // backward + bst_bin_t n_bins = gidx_end - gidx_begin; + bst_bin_t it_begin = gidx_end - 1; + bst_bin_t it_end = it_begin - n_bins + 1; + SumCallbackOp backward_op; + for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { + auto it = scan_begin - static_cast(threadIdx.x); + bool thread_active = it > it_end; + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + : GradientPairPrecise(); + BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); + GradientPairPrecise right_sum = parent_sum - left_sum; + + auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) + : kNullGain; // Find thread with best gain auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); @@ -207,13 +220,12 @@ class EvaluateSplitAgent { auto best_thread = __shfl_sync(0xffffffff, best.key, 0); // Best thread updates the split if (threadIdx.x == best_thread) { - GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; - GradientPairPrecise right = parent_sum - left; + assert(thread_active); // index of best threshold inside a feature. - auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); - auto dft_dir = missing_left ? 
kLeftDir : kRightDir; - // printf("best_thresh: %d\n", static_cast(best_thresh)); - best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); + // auto best_thresh = (it_begin - scan_begin) - static_cast(threadIdx.x); + auto best_thresh = scan_begin - static_cast(threadIdx.x) - gidx_begin; + assert(best_thresh > 0); + best_split->UpdateCat(gain, kRightDir, best_thresh, fidx, left_sum, right_sum, param); } } } @@ -255,7 +267,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - agent.Partition(&best_split, node_sorted_idx.data(), offset); + agent.Partition(&best_split, node_sorted_idx, offset); } } else { agent.Numerical(&best_split); @@ -296,16 +308,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu size_t node_offset = input_idx * shared_inputs.feature_values.size(); auto const best_thresh = out_split.thresh; if (best_thresh == -1) { - printf("Invalid split\n"); return; } auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); - // bool forward = true; bool forward = out_split.dir == kLeftDir; - // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), - // static_cast(forward)); - bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); assert(partition > 0 && "Invalid partition."); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index b624152eecfd..644eadc30f81 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -196,8 +196,6 @@ class HistEvaluator { CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] - // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), - // static_cast(d_step == 1)); } p_best->Update(best); From b1ada59a9601f16e71725949b975437967eb260a Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:45:47 +0800 Subject: [PATCH 06/19] Revert debug. 
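The forward/backward scan introduced in the previous patch is the core of the partition-based categorical split this series implements: categories are pre-sorted by their gradient statistics (the sorted_idx buffer), prefixes of that ordering are scanned from both ends, and the best-scoring prefix becomes the set of categories sent to the left child. As a rough illustration of the idea only -- a simplified sketch, not part of this change: it uses a plain L2-style gain in place of the evaluator and skips the missing-value bookkeeping and block-scan/warp-reduce machinery of the real kernel:

    def best_partition(grad, hess, sorted_cats, parent_grad, parent_hess,
                       reg_lambda=1.0, max_cat_threshold=64):
        # Toy search for the best prefix of the gradient-sorted categories.
        def gain(gl, hl, gr, hr):
            return gl * gl / (hl + reg_lambda) + gr * gr / (hr + reg_lambda)

        n = min(max_cat_threshold, len(sorted_cats))
        best_gain, best_left = -float("inf"), set()
        for order in (sorted_cats, list(reversed(sorted_cats))):  # forward, then backward
            gl = hl = 0.0
            for i, c in enumerate(order[:n - 1]):  # keep at least one category per side
                gl += grad[c]
                hl += hess[c]
                g = gain(gl, hl, parent_grad - gl, parent_hess - hl)
                if g > best_gain:
                    best_gain, best_left = g, set(order[:i + 1])
        return best_gain, best_left

This commit itself only reverts the temporary debugging changes in the Python tests: the reproduce_failure decorators, the JSON model dump, and the disabled one-hot / approx test paths.
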
--- tests/python-gpu/test_gpu_updaters.py | 3 +-- tests/python/test_updaters.py | 13 +++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index afb3dbe5cf85..b2f8b1a27050 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note, reproduce_failure +from hypothesis import given, strategies, assume, settings, note sys.path.append("tests/python") import testing as tm @@ -84,7 +84,6 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 4b864e447f8e..7faa4c58b51d 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -246,16 +246,10 @@ def run(max_cat_to_onehot: int): booster = xgb.train( parameters, Xy, - num_boost_round=8, + num_boost_round=16, evals=[(Xy, "Train")], evals_result=evals_result ) - import json - booster.save_model(f"{tree_method}.json") - with open(f"{tree_method}.json", "r") as fd: - model = json.load(fd) - with open(f"{tree_method}.json", "w") as fd: - json.dump(model, fd, indent=2) assert tm.non_increasing(evals_result["Train"]["rmse"]) y_predt = booster.predict(Xy) @@ -263,7 +257,7 @@ def run(max_cat_to_onehot: int): np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1]) # Test with OHE split - # run(self.USE_ONEHOT) + run(self.USE_ONEHOT) # Test with partition-based split run(self.USE_PART) @@ -400,7 +394,6 @@ def test_categorical_ames_housing( ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): - # self.run_categorical_missing(rows, cols, cats, "approx") + self.run_categorical_missing(rows, cols, cats, "approx") self.run_categorical_missing(rows, cols, cats, "hist") From 29515861c3494c6c5d5151d981cda81b9fd24fb9 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:48:25 +0800 Subject: [PATCH 07/19] Cleanup. 
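Drops the PartitionBasic test scaffolding (including the large commented-out cases) and the unused TestPartitionBasedEvaluation fixture from the C++ tests, and removes the leftover reproduce_failure import from the Python tests.
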
--- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 119 ------------------ tests/cpp/tree/test_evaluate_splits.h | 14 --- tests/python/test_updaters.py | 2 +- 3 files changed, 1 insertion(+), 134 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 2837036df32d..df57d4f2dcad 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -19,125 +19,6 @@ auto ZeroParam() { } } // anonymous namespace -TEST(GpuHist, PartitionBasic) { - TrainParam tparam = ZeroParam(); - tparam.max_cat_to_onehot = 0; - GPUTrainingParam param{tparam}; - - common::HistogramCuts cuts; - cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; - cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); - thrust::device_vector feature_set = std::vector{0}; - - thrust::device_vector monotonic_constraints(feature_set.size(), 0); - dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); - common::Span d_feature_types; - auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); - cuts.SetCategorical(true, max_cat); - d_feature_types = dh::ToSpan(feature_types); - - EvaluateSplitSharedInputs shared_inputs{ - param, - d_feature_types, - cuts.cut_ptrs_.ConstDeviceSpan(), - cuts.cut_values_.ConstDeviceSpan(), - cuts.min_vals_.ConstDeviceSpan(), - }; - - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); - - { - // -1.0s go left - // -3.0s go right - GradientPairPrecise parent_sum(-5.0, 3.0); - thrust::device_vector feature_histogram = - std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; - EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); - EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - } - - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-7.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; - // EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // All -1.0, gain from splitting should be 0.0 - // GradientPairPrecise parent_sum(-3.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate 
result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // With 3.0/3.0 missing values - // All categories go left - // missing values go right - // { - // GradientPairPrecise parent_sum(0.0, 6.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("11100000000000000000000000000000")); - // EXPECT_EQ(result.dir, kRightDir); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-5.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-5.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; - // EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } -} - void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 7db556659e2b..50a089fd6997 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -97,19 +97,5 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; - -class TestPartitionBasedEvaluation : public testing::Test { - common::HistogramCuts cuts; - - public: - void SetUp() override { - cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; - cuts.min_vals_.HostVector() = std::vector{0.0}; - - auto max_cat = 
*std::max_element(cuts.Values().begin(), cuts.Values().end()); - cuts.SetCategorical(true, max_cat); - } -}; } // namespace tree } // namespace xgboost diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 7faa4c58b51d..3e43b98ff113 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -5,7 +5,7 @@ import pytest import xgboost as xgb import numpy as np -from hypothesis import given, strategies, settings, note, reproduce_failure +from hypothesis import given, strategies, settings, note exact_parameter_strategy = strategies.fixed_dictionaries({ 'nthread': strategies.integers(1, 4), From 6c3de5eab5c91fa9bea2ed63d075460f733b1679 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:48:55 +0800 Subject: [PATCH 08/19] fix. --- src/tree/hist/evaluate_splits.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 644eadc30f81..5f737675b72e 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -143,6 +143,7 @@ class HistEvaluator { static_assert(d_step == +1 || d_step == -1, "Invalid step."); auto const &cut_ptr = cut.Ptrs(); + auto const &cut_val = cut.Values(); auto const &parent = snode_[nidx]; bst_bin_t f_begin = cut_ptr[fidx]; @@ -194,8 +195,10 @@ class HistEvaluator { common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); - std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, - [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] + std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { + auto cat = cut_val[c + f_begin]; + cat_bits.Set(cat); + }); } p_best->Update(best); From 37c79cbcce840c91309d340ebaf32f540aaa18b1 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:16:08 +0800 Subject: [PATCH 09/19] Support max_cat_thresh. --- src/tree/gpu_hist/evaluate_splits.cu | 84 ++++++++++++++-------------- src/tree/updater_gpu_common.cuh | 4 +- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 58e5e41d140e..257a58000f1d 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -167,66 +167,66 @@ class EvaluateSplitAgent { } } + __device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active, + bool missing_left, bst_bin_t it, + GradientPairPrecise const &left_sum, + GradientPairPrecise const &right_sum, + DeviceSplitCandidate *__restrict__ best_split) { + auto gain = + thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) : kNullGain; + + // Find thread with best gain + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + // This reduce result is only valid in thread 0 + // broadcast to the rest of the warp + auto best_thread = __shfl_sync(0xffffffff, best.key, 0); + // Best thread updates the split + if (threadIdx.x == best_thread) { + assert(thread_active); + // index of best threshold inside a feature. + auto best_thresh = it - gidx_begin; + best_split->UpdateCat(gain, missing_left ? 
kLeftDir : kRightDir, best_thresh, fidx, left_sum, + right_sum, param); + } + } + __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, common::Span sorted_idx, - std::size_t offset) { - for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end - 1; scan_begin += kBlockSize) { - bool thread_active = (scan_begin + threadIdx.x) < (gidx_end - 1); + std::size_t offset, GPUTrainingParam const& param) { + bst_bin_t n_bins_feature = gidx_end - gidx_begin; + auto n_bins = std::min(param.max_cat_threshold, n_bins_feature); - auto right_sum = - thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) - : GradientPairPrecise(); + bst_bin_t it_begin = gidx_begin; + bst_bin_t it_end = it_begin + n_bins - 1; + // forward + for (bst_bin_t scan_begin = it_begin; scan_begin < it_end; scan_begin += kBlockSize) { + auto it = scan_begin + threadIdx.x; + bool thread_active = it < it_end; + + auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); GradientPairPrecise left_sum = parent_sum - right_sum; - auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) - : kNullGain; - // Find thread with best gain - auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); - // This reduce result is only valid in thread 0 - // broadcast to the rest of the warp - auto best_thread = __shfl_sync(0xffffffff, best.key, 0); - // Best thread updates the split - if (threadIdx.x == best_thread) { - // index of best threshold inside a feature. - auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); - best_split->UpdateCat(gain, kLeftDir, best_thresh, fidx, left_sum, right_sum, param); - } + PartitionUpdate(scan_begin, thread_active, true, it, left_sum, right_sum, best_split); } - cub::CTA_SYNC(); // backward - bst_bin_t n_bins = gidx_end - gidx_begin; - bst_bin_t it_begin = gidx_end - 1; - bst_bin_t it_end = it_begin - n_bins + 1; + it_begin = gidx_end - 1; + it_end = it_begin - n_bins + 1; SumCallbackOp backward_op; for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) : GradientPairPrecise(); + // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); GradientPairPrecise right_sum = parent_sum - left_sum; - - auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) - : kNullGain; - - // Find thread with best gain - auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); - // This reduce result is only valid in thread 0 - // broadcast to the rest of the warp - auto best_thread = __shfl_sync(0xffffffff, best.key, 0); - // Best thread updates the split - if (threadIdx.x == best_thread) { - assert(thread_active); - // index of best threshold inside a feature. 
- // auto best_thresh = (it_begin - scan_begin) - static_cast(threadIdx.x); - auto best_thresh = scan_begin - static_cast(threadIdx.x) - gidx_begin; - assert(best_thresh > 0); - best_split->UpdateCat(gain, kRightDir, best_thresh, fidx, left_sum, right_sum, param); - } + PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } } }; @@ -267,7 +267,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - agent.Partition(&best_split, node_sorted_idx, offset); + agent.Partition(&best_split, node_sorted_idx, offset, shared_inputs.param); } } else { agent.Numerical(&best_split); diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index b9a31dd32477..891ba2a914ac 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -29,6 +29,7 @@ struct GPUTrainingParam { float max_delta_step; float learning_rate; uint32_t max_cat_to_onehot; + bst_bin_t max_cat_threshold; GPUTrainingParam() = default; @@ -38,7 +39,8 @@ struct GPUTrainingParam { reg_alpha(param.reg_alpha), max_delta_step(param.max_delta_step), learning_rate{param.learning_rate}, - max_cat_to_onehot{param.max_cat_to_onehot} {} + max_cat_to_onehot{param.max_cat_to_onehot}, + max_cat_threshold{param.max_cat_threshold} {} }; /** From e52b4e5379d8f3e1348263db9223bc07a9c4d2a2 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:20:17 +0800 Subject: [PATCH 10/19] Tests. --- tests/python-gpu/test_gpu_updaters.py | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index b2f8b1a27050..d0e7c5bc883d 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -1,3 +1,4 @@ +from typing import Dict, Any import numpy as np import sys import gc @@ -77,6 +78,48 @@ def test_sparse(self, dataset): def test_categorical_ohe(self, rows, cols, rounds, cats): self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist") + @given( + tm.categorical_dataset_strategy, + test_up.exact_parameter_strategy, + test_up.hist_parameter_strategy, + test_up.cat_parameter_strategy, + strategies.integers(4, 32), + ) + @settings(deadline=None, print_blob=True) + @pytest.mark.skipif(**tm.no_pandas()) + def test_categorical( + self, + dataset: tm.TestDataset, + exact_parameters: Dict[str, Any], + hist_parameters: Dict[str, Any], + cat_parameters: Dict[str, Any], + n_rounds: int, + ) -> None: + cat_parameters.update(exact_parameters) + cat_parameters.update(hist_parameters) + cat_parameters["tree_method"] = "gpu_hist" + + results = train_result(cat_parameters, dataset.get_dmat(), n_rounds) + tm.non_increasing(results["train"]["rmse"]) + + @given( + test_up.hist_parameter_strategy, + test_up.cat_parameter_strategy, + ) + @settings(deadline=None, print_blob=True) + def test_categorical_ames_housing( + self, + hist_parameters: Dict[str, Any], + cat_parameters: Dict[str, Any], + ) -> None: + cat_parameters.update(hist_parameters) + dataset = tm.TestDataset( + "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse" + ) + cat_parameters["tree_method"] = "gpu_hist" + results = train_result(cat_parameters, dataset.get_dmat(), 16) + tm.non_increasing(results["train"]["rmse"]) + @given( strategies.integers(10, 400), strategies.integers(3, 8), From 
408f9bcf355e0e7cc3ed2c7399394746db283ea7 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:32:48 +0800 Subject: [PATCH 11/19] Cleanup. --- src/tree/gpu_hist/evaluate_splits.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 257a58000f1d..f07b28a8687b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -216,7 +216,7 @@ class EvaluateSplitAgent { // backward it_begin = gidx_end - 1; it_end = it_begin - n_bins + 1; - SumCallbackOp backward_op; + prefix_op = SumCallbackOp{}; // reset for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; @@ -224,7 +224,7 @@ class EvaluateSplitAgent { auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. - BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); + BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op); GradientPairPrecise right_sum = parent_sum - left_sum; PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } From 2732c6b6c2d5203919925e77295bf6b7bcf5226f Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:43:14 +0800 Subject: [PATCH 12/19] Size. --- src/tree/hist/evaluate_splits.h | 2 +- src/tree/updater_gpu_hist.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 5f737675b72e..0a09718ef911 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -190,7 +190,7 @@ class HistEvaluator { } if (best_thresh != -1) { - auto n = common::CatBitField::ComputeStorageSize(n_bins_feature + 1); + auto n = common::CatBitField::ComputeStorageSize(n_bins_feature); best.cat_bits = decltype(best.cat_bits)(n, 0); common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index bafe800cd98b..e1a940a9df34 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -608,7 +608,7 @@ struct GPUHistMakerDevice { CHECK_GT(candidate.split.split_cats.Bits().size(), 0); auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex); - split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature + 1), 0); + split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); From 3d877908b57f83a89a6d00688b1bbb630b756cc4 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:44:21 +0800 Subject: [PATCH 13/19] lint. 
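The change below is purely a linter fix in the Dask interface: a membership test on a dict should use the mapping itself rather than building a keys view first. A minimal sketch of the idiom, with a hypothetical dict standing in for the serialised state (not part of the patch):

    # Hypothetical stand-in for the object's __dict__ during pickling.
    state = {"_client": None, "n_estimators": 100}
    # Preferred: 'in' on a dict already tests the keys.
    if "_client" in state:
        del state["_client"]
    # Equivalent but redundant, and flagged by pylint:
    # if "_client" in state.keys(): ...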
--- python-package/xgboost/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 951676a81757..75eeba875fee 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -1684,7 +1684,7 @@ async def _() -> Awaitable[Any]: def __getstate__(self) -> Dict: this = self.__dict__.copy() - if "_client" in this.keys(): + if "_client" in this: del this["_client"] return this From 93aab02522c49f329cbd9d8cb37ebc0990cc08fd Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 16:17:45 +0800 Subject: [PATCH 14/19] Fix test. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index df57d4f2dcad..99802b61034f 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -68,7 +68,11 @@ void TestEvaluateSingleSplit(bool is_categorical) { DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); - EXPECT_EQ(result.fvalue, 11.0); + if (is_categorical) { + ASSERT_TRUE(std::isnan(result.fvalue)); + } else { + EXPECT_EQ(result.fvalue, 11.0); + } EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), From 19a8fbd04e30b4a2aede3672bb0ec5602b0bb0fc Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 17:52:44 +0800 Subject: [PATCH 15/19] Add quick test. --- src/tree/gpu_hist/evaluate_splits.cu | 19 ++++-- src/tree/updater_gpu_common.cuh | 1 + .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 58 ++++++++++++------- tests/cpp/tree/hist/test_evaluate_splits.cc | 28 +++++++++ tests/cpp/tree/test_evaluate_splits.h | 54 +++++++++++++++++ 5 files changed, 134 insertions(+), 26 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index f07b28a8687b..0888e590ba94 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -166,7 +166,9 @@ class EvaluateSplitAgent { } } } - + /** + * \brief Gather and update the best split. + */ __device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active, bool missing_left, bst_bin_t it, GradientPairPrecise const &left_sum, @@ -189,10 +191,13 @@ class EvaluateSplitAgent { right_sum, param); } } - + /** + * \brief Partition-based split for categorical feature. + */ __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, common::Span sorted_idx, - std::size_t offset, GPUTrainingParam const& param) { + std::size_t node_offset, + GPUTrainingParam const ¶m) { bst_bin_t n_bins_feature = gidx_end - gidx_begin; auto n_bins = std::min(param.max_cat_threshold, n_bins_feature); @@ -201,10 +206,10 @@ class EvaluateSplitAgent { // forward for (bst_bin_t scan_begin = it_begin; scan_begin < it_end; scan_begin += kBlockSize) { - auto it = scan_begin + threadIdx.x; + auto it = scan_begin + static_cast(threadIdx.x); bool thread_active = it < it_end; - auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. 
BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); @@ -217,15 +222,17 @@ class EvaluateSplitAgent { it_begin = gidx_end - 1; it_end = it_begin - n_bins + 1; prefix_op = SumCallbackOp{}; // reset + for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; - auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op); GradientPairPrecise right_sum = parent_sum - left_sum; + PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } } diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 891ba2a914ac..49c184a4abbb 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -132,6 +132,7 @@ struct DeviceSplitCandidate { << "dir: " << c.dir << ", " << "findex: " << c.findex << ", " << "fvalue: " << c.fvalue << ", " + << "thresh: " << c.thresh << ", " << "is_cat: " << c.is_cat << ", " << "left sum: " << c.left_sum << ", " << "right sum: " << c.right_sum << std::endl; diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 99802b61034f..2bf798279342 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -2,6 +2,7 @@ * Copyright 2020-2022 by XGBoost contributors */ #include + #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh" #include "../../helpers.h" #include "../../histogram_helpers.h" @@ -17,29 +18,53 @@ auto ZeroParam() { tparam.UpdateAllowUnknown(args); return tparam; } + } // anonymous namespace +TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { + thrust::device_vector feature_set = std::vector{0}; + GPUTrainingParam param{param_}; + cuts_.cut_ptrs_.SetDevice(0); + cuts_.cut_values_.SetDevice(0); + cuts_.min_vals_.SetDevice(0); + thrust::device_vector feature_histogram{feature_histogram_}; + + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + auto d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitInputs input{1, 0, parent_sum_, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts_.cut_ptrs_.ConstDeviceSpan(), + cuts_.cut_values_.ConstDeviceSpan(), + cuts_.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0}; + + evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, 0); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + + ASSERT_EQ(result.thresh, 1); + this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat, + result.dir == kLeftDir, result.left_sum, result.right_sum); +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts; - cuts.cut_values_.HostVector() = std::vector{1.0, 2.0, 11.0, 12.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 2, 4}; - cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); 
- cuts.min_vals_.SetDevice(0); - thrust::device_vector feature_set = - std::vector{0, 1}; + common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)}; + thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain thrust::device_vector feature_histogram = std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); common::Span d_feature_types; @@ -50,10 +75,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { d_feature_types = dh::ToSpan(feature_types); } - EvaluateSplitInputs input{1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{ param, d_feature_types, @@ -83,7 +106,7 @@ TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); } -TEST(GpuHist, EvaluateCategoricalSplit) { +TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); } @@ -100,7 +123,6 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { thrust::device_vector feature_min_values = std::vector{0.0}; thrust::device_vector feature_histogram = std::vector{{-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -150,7 +172,6 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { thrust::device_vector feature_histogram = std::vector{ {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(2, 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -190,7 +211,6 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { thrust::device_vector feature_histogram = std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(2, 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -231,7 +251,6 @@ TEST(GpuHist, EvaluateSplits) { thrust::device_vector feature_histogram_right = std::vector{ {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); EvaluateSplitInputs input_left{ 1,0, parent_sum, @@ -290,6 +309,5 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_NEAR(split.loss_chg, best_score_, 1e-16); } - } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 0dede27066a8..37c8597cab53 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -185,5 +185,33 @@ TEST(HistEvaluator, Categorical) { ASSERT_EQ(with_onehot.split.loss_chg, with_part.split.loss_chg); } + +TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { + common::HistCollection hist; + hist.Init(cuts_.TotalBins()); + hist.AddHistRow(0); + hist.AllocateAllData(); + auto node_hist = hist[0]; + ASSERT_EQ(node_hist.size(), feature_histogram_.size()); + std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin()); + + auto sampler = std::make_shared(); + MetaInfo info; + info.num_col_ = 1; + info.feature_types = {FeatureType::kCategorical}; + auto evaluator = + HistEvaluator{param_, info, 
common::OmpGetNumThreads(0), sampler}; + evaluator.InitRoot(GradStats{parent_sum_}); + + std::vector entries(1); + RegTree tree; + evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries); + auto const& split = entries.front().split; + + this->CheckResult(split.loss_chg, split.SplitIndex(), split.split_value, split.is_cat, + split.DefaultLeft(), + GradientPairPrecise{split.left_sum.GetGrad(), split.left_sum.GetHess()}, + GradientPairPrecise{split.right_sum.GetGrad(), split.right_sum.GetHess()}); +} } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 50a089fd6997..2421b8ba0e41 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -97,5 +97,59 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; + +inline auto MakeCutsForTest(std::vector values, std::vector ptrs, + std::vector min_values, int32_t device) { + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = values; + cuts.cut_ptrs_.HostVector() = ptrs; + cuts.min_vals_.HostVector() = min_values; + + if (device >= 0) { + cuts.cut_ptrs_.SetDevice(device); + cuts.cut_values_.SetDevice(device); + cuts.min_vals_.SetDevice(device); + } + + return cuts; +} + +class TestCategoricalSplitWithMissing : public testing::Test { + protected: + common::HistogramCuts cuts_; + // Setup gradients and parent sum with missing values. + GradientPairPrecise parent_sum_{1.0, 6.0}; + std::vector feature_histogram_{ + {0.5, 0.5}, {0.5, 0.5}, {1.0, 1.0}, {1.0, 1.0}}; + TrainParam param_; + + void SetUp() override { + cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1); + auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(), + cuts_.cut_values_.HostVector().end()); + cuts_.SetCategorical(true, max_cat); + param_.UpdateAllowUnknown( + Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}, {"max_cat_to_onehot", "1"}}); + } + + void CheckResult(float loss_chg, bst_feature_t split_ind, float fvalue, bool is_cat, + bool dft_left, GradientPairPrecise left_sum, GradientPairPrecise right_sum) { + // forward + // it: 0, gain: 0.545455 + // it: 1, gain: 1.000000 + // it: 2, gain: 2.250000 + // backward + // it: 3, gain: 1.000000 + // it: 2, gain: 2.250000 + // it: 1, gain: 3.142857 + ASSERT_NEAR(loss_chg, 2.97619, kRtEps); + ASSERT_TRUE(is_cat); + ASSERT_TRUE(std::isnan(fvalue)); + ASSERT_EQ(split_ind, 0); + ASSERT_FALSE(dft_left); + ASSERT_EQ(left_sum.GetHess(), 2.5); + ASSERT_EQ(right_sum.GetHess(), parent_sum_.GetHess() - left_sum.GetHess()); + } +}; } // namespace tree } // namespace xgboost From fb4bba43a29734f055ee11d518689e3d9ec4e9ba Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:12:10 +0800 Subject: [PATCH 16/19] Add tests from Rory. 
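These tests drive the partition-based categorical evaluator with tiny hand-built histograms and check which categories land in the split bitset. As a rough sanity check for the first PartitionBasic case, the expected "1100...0" bitset can be verified by hand, assuming reg_lambda = 0 (as in ZeroParam) and the usual sum_grad^2 / sum_hess gain; the snippet is an illustration, not part of the patch:

    # Rough check of the first PartitionBasic case (assumption: lambda = 0,
    # gain(s) = grad(s)^2 / hess(s), no missing values in this node).
    def gain(g, h):
        return g * g / h if h > 0 else 0.0

    parent = (-5.0, 3.0)                            # parent_sum of the node
    bins = [(-1.0, 1.0), (-1.0, 1.0), (-3.0, 1.0)]  # one (grad, hess) pair per category
    parent_gain = gain(*parent)                     # 25 / 3, about 8.33
    # Two distinct ways to cut this feature: isolate the -3.0 bin, or isolate a -1.0 bin.
    for one_side in ([2], [0]):
        g = sum(bins[c][0] for c in one_side)
        h = sum(bins[c][1] for c in one_side)
        print(one_side, gain(g, h) + gain(parent[0] - g, parent[1] - h) - parent_gain)
    # Isolating the -3.0 bin scores 9 + 2 - 25/3 (about 2.67); isolating a -1.0 bin
    # scores only 1 + 8 - 25/3 (about 0.67). The two -1.0 categories therefore sit
    # together on one side, which is the "1100...0" bitset the test expects.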
--- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 2bf798279342..6636dc4c8667 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -52,6 +52,242 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { result.dir == kLeftDir, result.left_sum, result.right_sum); } +TEST(GpuHist, PartitionBasic) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types; + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-7.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // All -1.0, gain from splitting should be 0.0 + GradientPairPrecise parent_sum(-3.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 
shared_inputs).split; + EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + // With 3.0/3.0 missing values + // All categories go left + // missing values go right + { + GradientPairPrecise parent_sum(0.0, 6.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{3, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_EQ(result.dir, kLeftDir); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } +} + +TEST(GpuHist, PartitionTwoFeatures) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0, 0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3, 6}; + cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0, 1}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types(dh::ToSpan(feature_types)); + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + 
cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram = std::vector{ + {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.findex, 1); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram = std::vector{ + {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.findex, 1); + EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } +} + +TEST(GpuHist, PartitionTwoNodes) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types(dh::ToSpan(feature_types)); + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram_a = + std::vector{{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, + {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + thrust::device_vector inputs(2); + inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_a)}; + thrust::device_vector feature_histogram_b = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + inputs[1] = EvaluateSplitInputs{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_b)}; + thrust::device_vector results(2); + evaluator.EvaluateSplits({0, 1}, 1, 
dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results)); + GPUExpandEntry result_a = results[0]; + GPUExpandEntry result_b = results[1]; + EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]), + std::bitset<32>("10000000000000000000000000000000")); + EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]), + std::bitset<32>("11000000000000000000000000000000")); + } +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); From 76ac92da70e6c5865a8ac38fff205cf725d3f035 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:17:57 +0800 Subject: [PATCH 17/19] add check for direction. --- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 6636dc4c8667..5b070921925b 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -86,8 +86,8 @@ TEST(GpuHist, PartitionBasic) { evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; @@ -95,14 +95,15 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-7.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; @@ -110,6 +111,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); @@ -122,13 +124,13 @@ TEST(GpuHist, PartitionBasic) { EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + EXPECT_EQ(result.dir, kLeftDir); EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } // With 3.0/3.0 missing values - // All categories go left - // missing values go right + // Forward, first 2 categories are selected, while the last one go to left along with missing value { GradientPairPrecise parent_sum(0.0, 6.0); thrust::device_vector feature_histogram = @@ -143,8 +145,8 @@ TEST(GpuHist, PartitionBasic) { 
EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; @@ -152,6 +154,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); @@ -166,6 +169,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); From 4acc9b4781fce30a4785d9e8a37bebe0b1c356f0 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:26:46 +0800 Subject: [PATCH 18/19] comment. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 5b070921925b..9228074625a0 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -160,8 +160,8 @@ TEST(GpuHist, PartitionBasic) { EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; From 8e43aa39c6f2d0cc4bbfe654c583ab45e146de82 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 18:16:20 +0800 Subject: [PATCH 19/19] floating point. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 9228074625a0..6ef8350bc505 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -169,7 +169,6 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess());
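For reference, the forward/backward gains quoted in the TestCategoricalSplitWithMissing fixture can be reproduced by hand. A minimal sketch, assuming reg_lambda = 0 and gain(s) = grad(s)^2 / hess(s) (the bins already sit in scan order here, so the sort by gradient ratio is a no-op); this is an illustration of the fixture's numbers, not part of the patch series:

    # Reproduces the gains listed in TestCategoricalSplitWithMissing (assumption:
    # lambda = 0; missing values are whatever remains once the histogram bins are
    # subtracted from the parent sum).
    def gain(g, h):
        return g * g / h if h > 0 else 0.0

    parent = (1.0, 6.0)                                      # parent_sum_
    bins = [(0.5, 0.5), (0.5, 0.5), (1.0, 1.0), (1.0, 1.0)]  # feature_histogram_
    parent_gain = gain(*parent)                              # 1 / 6

    # Forward scan: categories accumulate on one side, the remainder (which
    # includes the missing-value gradients) stays on the other side.
    g = h = 0.0
    for it, (bg, bh) in enumerate(bins[:-1]):
        g, h = g + bg, h + bh
        print("forward ", it, gain(g, h) + gain(parent[0] - g, parent[1] - h))

    # Backward scan over the reversed order.
    g = h = 0.0
    for it, (bg, bh) in zip(range(len(bins) - 1, 0, -1), reversed(bins)):
        g, h = g + bg, h + bh
        print("backward", it, gain(g, h) + gain(parent[0] - g, parent[1] - h))

    # Best candidate: backward it = 1 with gain ~ 3.142857, so
    # loss_chg ~ 3.142857 - 1/6 ~ 2.97619 and the accumulated side has hess 2.5,
    # matching CheckResult in the fixture.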