Small refactor to categoricals #7858

Merged
merged 5 commits on May 5, 2022
Changes from 3 commits
4 changes: 2 additions & 2 deletions src/common/categorical.h
@@ -82,8 +82,8 @@ inline void InvalidCategory() {
/*!
* \brief Whether should we use onehot encoding for categorical data.
*/
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
bool use_one_hot = n_cats < max_cat_to_onehot;
return use_one_hot;
}

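The predicate in src/common/categorical.h no longer consults the training objective: whether a categorical feature is split with one-hot encoding now depends only on its bin count and the `max_cat_to_onehot` parameter. A standalone sketch of the new behaviour (the predicate mirrors the hunk above; the sample values are made up):

```cpp
#include <cstdint>

// Simplified predicate from the hunk above: the decision depends only on the
// bin count and max_cat_to_onehot, not on the objective (ObjInfo).
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
  return n_cats < max_cat_to_onehot;
}

int main() {
  // Hypothetical values: with max_cat_to_onehot = 4, a 3-category feature
  // takes the one-hot path, a 16-category feature takes the sort-based path.
  bool small_feature = UseOneHot(3, 4);   // true  -> one-hot split
  bool large_feature = UseOneHot(16, 4);  // false -> sort-based split
  return (small_feature && !large_feature) ? 0 : 1;
}
```
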
42 changes: 23 additions & 19 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -241,7 +241,7 @@ __global__ void EvaluateSplitsKernel(

if (common::IsCat(inputs.feature_types, fidx)) {
auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx];
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) {
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) {
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
kOneHot>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
} else {
@@ -273,12 +273,19 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,
* \brief Set the bits for categorical splits based on the split threshold.
*/
template <typename GradientSumT>
__device__ void SortBasedSplit(EvaluateSplitInputs<GradientSumT> const &input,
__device__ void SetCategoricalSplit(EvaluateSplitInputs<GradientSumT> const &input,
common::Span<bst_feature_t const> d_sorted_idx, bst_feature_t fidx,
bool is_left, common::Span<common::CatBitField::value_type> out,
DeviceSplitCandidate *p_out_split) {
DeviceSplitCandidate *p_out_split, ObjInfo task) {
auto &out_split = *p_out_split;
out_split.split_cats = common::CatBitField{out};

// Simple case for one hot split
if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) {
out_split.split_cats.Set(common::AsCat(out_split.fvalue));
return;
}

auto node_sorted_idx =
is_left ? d_sorted_idx.subspan(0, input.feature_values.size())
: d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size());
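
`SortBasedSplit` is renamed to `SetCategoricalSplit` and now handles the one-hot case itself: when `UseOneHot` holds, only the single matched category (taken from `fvalue`) is set in the split bitset and the function returns early; otherwise the node's sorted index decides the partition. A rough sketch of the two cases with a hypothetical bitfield type (not the real `common::CatBitField` API):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for common::CatBitField: one bit per category.
struct CatBits {
  std::vector<std::uint32_t> words;
  explicit CatBits(std::size_t n_cats) : words((n_cats + 31) / 32, 0u) {}
  void Set(std::uint32_t cat) { words[cat / 32] |= 1u << (cat % 32); }
};

// One-hot case: the winning category is already encoded in the split value,
// so only that single bit needs to be set.
CatBits OneHotSplit(std::uint32_t chosen_cat, std::size_t n_cats) {
  CatBits bits(n_cats);
  bits.Set(chosen_cat);
  return bits;
}

// Sort-based case: categories were ordered by gradient statistics and every
// category before the chosen threshold goes to one side of the split.
CatBits PartitionSplit(std::vector<std::uint32_t> const& sorted_cats,
                       std::size_t threshold, std::size_t n_cats) {
  CatBits bits(n_cats);
  for (std::size_t i = 0; i < threshold; ++i) {
    bits.Set(sorted_cats[i]);
  }
  return bits;
}
```
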
@@ -313,7 +320,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) {
if (!split_cats_.empty()) {
if (need_sort_histogram_) {
this->SortHistogram(left, right, evaluator);
}

@@ -354,14 +361,13 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::CopyToHost(EvaluateSplitInputs<GradientSumT> const &input,
common::Span<CatST> cats_out) {
if (has_sort_) {
dh::CUDAEvent event;
event.Record(dh::DefaultStream());
auto h_cats = this->HostCatStorage(input.nidx);
copy_stream_.View().Wait(event);
dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
cudaMemcpyDeviceToHost, copy_stream_.View()));
}
if (cats_out.empty()) return;
dh::CUDAEvent event;
event.Record(dh::DefaultStream());
auto h_cats = this->HostCatStorage(input.nidx);
copy_stream_.View().Wait(event);
dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
cudaMemcpyDeviceToHost, copy_stream_.View()));
}
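
With the `has_sort_` flag gone, `CopyToHost` simply returns when there is no categorical storage for the node and otherwise performs the asynchronous copy. The ordering trick — record an event on the default stream, make the dedicated copy stream wait on it, then issue the device-to-host copy — can be sketched with raw CUDA runtime calls (illustrative only, not the `dh::` wrappers):

```cpp
#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

int main() {
  constexpr std::size_t kWords = 4;
  std::uint32_t* d_cats = nullptr;
  cudaMalloc(&d_cats, kWords * sizeof(std::uint32_t));
  cudaMemset(d_cats, 0, kWords * sizeof(std::uint32_t));

  std::vector<std::uint32_t> h_cats(kWords);

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);
  cudaEvent_t event;
  cudaEventCreate(&event);

  // Work producing d_cats would be enqueued on the default stream here.
  cudaEventRecord(event, /*stream=*/0);        // mark the point to wait for
  cudaStreamWaitEvent(copy_stream, event, 0);  // copy stream waits for it
  cudaMemcpyAsync(h_cats.data(), d_cats, kWords * sizeof(std::uint32_t),
                  cudaMemcpyDeviceToHost, copy_stream);

  cudaStreamSynchronize(copy_stream);  // GetHostNodeCats syncs the stream later
  cudaEventDestroy(event);
  cudaStreamDestroy(copy_stream);
  cudaFree(d_cats);
  return 0;
}
```
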

template <typename GradientSumT>
@@ -378,17 +384,16 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, Ob
auto d_sorted_idx = this->SortedIdx(left);
auto d_entries = out_entries;
auto cats_out = this->DeviceCatStorage(left.nidx);
// turn candidate into entry, along with hanlding sort based split.
// turn candidate into entry, along with handling sort based split.
dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) {
auto const &input = i == 0 ? left : right;
auto &split = out_splits[i];
auto fidx = out_splits[i].findex;

if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
if (split.is_cat) {
bool is_left = i == 0;
auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2);
SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]);
SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i], task);
}

float base_weight =
@@ -420,9 +425,8 @@ GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
auto &split = out_split[i];
auto fidx = out_split[i].findex;

if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]);
if (split.is_cat) {
SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i], task);
}

float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum});
34 changes: 22 additions & 12 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -58,9 +58,12 @@ class GPUHistEvaluator {
dh::device_vector<bst_feature_t> feature_idx_;
// Training param used for evaluation
TrainParam param_;
// whether the input data requires sort based split, which is more complicated so we try
// to avoid it if possible.
bool has_sort_{false};
// Do we have any categorical features that require sorting histograms?
// Use this to skip the expensive sort step.
bool need_sort_histogram_ = false;
// Number of elements of categorical storage type
// needed to hold the categories of a single node.
std::size_t node_categorical_storage_size_ = 0;

// Copy the categories from device to host asynchronously.
void CopyToHost(EvaluateSplitInputs<GradientSumT> const &input, common::Span<CatST> cats_out);
@@ -69,25 +72,33 @@
* \brief Get host category storage of nidx for internal calculation.
*/
auto HostCatStorage(bst_node_t nidx) {
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();

std::size_t min_size = (nidx + 2) * node_categorical_storage_size_;
if (h_split_cats_.size() < min_size) {
h_split_cats_.resize(min_size);
}

if (nidx == RegTree::kRoot) {
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits * 2);
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
}

/**
* \brief Get device category storage of nidx for internal calculation.
*/
auto DeviceCatStorage(bst_node_t nidx) {
auto cat_bits = split_cats_.size() / param_.MaxNodes();
std::size_t min_size = (nidx + 2) * node_categorical_storage_size_;
if (split_cats_.size() < min_size) {
split_cats_.resize(min_size);
}
if (nidx == RegTree::kRoot) {
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits);
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits * 2);
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
}
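
Both accessors replace the old `h_split_cats_.size() / param_.MaxNodes()` arithmetic with a fixed per-node size and grow the backing buffer on demand to `(nidx + 2) * node_categorical_storage_size_` elements, enough for the node being evaluated and its sibling. A compact host-side sketch of that resize-then-offset logic (a hypothetical helper, not the real class):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the lazy growth now used by HostCatStorage/DeviceCatStorage:
// instead of pre-allocating bitset storage for param_.MaxNodes() nodes, the
// buffer is grown to cover the requested node and its sibling.
// node_storage_size plays the role of node_categorical_storage_size_.
struct CatStorageSketch {
  std::size_t node_storage_size{1};
  std::vector<std::uint32_t> words;

  // Returns the starting offset of nidx's span, growing the buffer if needed.
  std::size_t EnsureNode(int nidx) {
    std::size_t min_size =
        (static_cast<std::size_t>(nidx) + 2) * node_storage_size;
    if (words.size() < min_size) {
      words.resize(min_size, 0u);
    }
    return static_cast<std::size_t>(nidx) * node_storage_size;
  }
};
```
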

@@ -114,7 +125,7 @@ class GPUHistEvaluator {
/**
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, ObjInfo task,
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, int32_t device);

/**
@@ -123,8 +134,7 @@
*/
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
copy_stream_.View().Sync();
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
/**
77 changes: 38 additions & 39 deletions src/tree/gpu_hist/evaluator.cu
@@ -16,12 +16,12 @@ namespace xgboost {
namespace tree {
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
common::Span<FeatureType const> ft, ObjInfo task,
common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
if (cuts.HasCategorical() && !task.UseOneHot()) {
if (cuts.HasCategorical()) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
auto beg = thrust::make_counting_iterator<size_t>(1ul);
@@ -30,46 +30,45 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
// This condition avoids sort-based split function calls if the users want
// onehot-encoding-based splits.
// For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x.
has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
auto idx = i - 1;
if (common::IsCat(ft, idx)) {
auto n_bins = ptrs[i] - ptrs[idx];
bool use_sort = !common::UseOneHot(n_bins, to_onehot, task);
return use_sort;
}
return false;
});
need_sort_histogram_ =
thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
auto idx = i - 1;
if (common::IsCat(ft, idx)) {
auto n_bins = ptrs[i] - ptrs[idx];
bool use_sort = !common::UseOneHot(n_bins, to_onehot);
return use_sort;
}
return false;
});

if (has_sort_) {
auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
CHECK_NE(bit_storage_size, 0);
// We need to allocate for all nodes since the updater can grow the tree layer by
// layer, all nodes in the same layer must be preserved until that layer is
// finished. We can allocate one layer at a time, but the best case is reducing the
// size of the bitset by about a half, at the cost of invoking CUDA malloc many more
// times than necessary.
split_cats_.resize(param.MaxNodes() * bit_storage_size);
h_split_cats_.resize(split_cats_.size());
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
node_categorical_storage_size_ =
common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
CHECK_NE(node_categorical_storage_size_, 0);
// We need to allocate for all nodes since the updater can grow the tree layer by
// layer, all nodes in the same layer must be preserved until that layer is
// finished. We can allocate one layer at a time, but the best case is reducing the
// size of the bitset by about a half, at the cost of invoking CUDA malloc many more
// times than necessary.
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));

cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());

/**
* cache feature index binary search result
*/
feature_idx_.resize(cat_sorted_idx_.size());
auto d_fidxes = dh::ToSpan(feature_idx_);
auto it = thrust::make_counting_iterator(0ul);
auto values = cuts.cut_values_.ConstDeviceSpan();
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(),
feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(ptrs, i);
return fidx;
});
}
/**
* cache feature index binary search result
*/
feature_idx_.resize(cat_sorted_idx_.size());
auto d_fidxes = dh::ToSpan(feature_idx_);
auto it = thrust::make_counting_iterator(0ul);
auto values = cuts.cut_values_.ConstDeviceSpan();
thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
[=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(ptrs, i);
return fidx;
});
}
}

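`Reset` now always sizes the categorical bitset storage from `cuts.MaxCategory()` when categorical features are present, and the `thrust::any_of` scan only decides whether histogram sorting is needed (`need_sort_histogram_`). The same check, written as plain host code for clarity (illustrative names, not the device implementation above):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Host-side sketch of the need_sort_histogram_ check: the sorted-histogram
// path is required as soon as any categorical feature has too many bins for
// one-hot encoding. cut_ptrs and is_cat are illustrative stand-ins for
// cuts.cut_ptrs_ and the feature-type span.
bool NeedSortHistogram(std::vector<std::uint32_t> const& cut_ptrs,
                       std::vector<bool> const& is_cat,
                       std::uint32_t max_cat_to_onehot) {
  std::vector<std::size_t> fidx(is_cat.size());
  std::iota(fidx.begin(), fidx.end(), std::size_t{0});
  return std::any_of(fidx.begin(), fidx.end(), [&](std::size_t f) {
    if (!is_cat[f]) return false;
    std::uint32_t n_bins = cut_ptrs[f + 1] - cut_ptrs[f];
    return !(n_bins < max_cat_to_onehot);  // same predicate as UseOneHot
  });
}
```
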
2 changes: 1 addition & 1 deletion src/tree/hist/evaluate_splits.h
@@ -244,7 +244,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
}
if (is_cat) {
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) {
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
} else {
43 changes: 13 additions & 30 deletions src/tree/updater_gpu_hist.cu
@@ -197,8 +197,6 @@ struct GPUHistMakerDevice {
std::unique_ptr<GradientBasedSampler> sampler;

std::unique_ptr<FeatureGroups> feature_groups;
// Storing split categories for last node.
dh::caching_device_vector<uint32_t> node_categories;

GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
@@ -243,7 +241,7 @@ struct GPUHistMakerDevice {
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));

this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param,
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
ctx_->gpu_id);

this->interaction_constraints.Reset();
@@ -354,14 +352,14 @@ struct GPUHistMakerDevice {
return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent);
}

void UpdatePosition(int nidx, RegTree* p_tree) {
RegTree::Node split_node = (*p_tree)[nidx];
auto split_type = p_tree->NodeSplitType(nidx);
void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) {
RegTree::Node split_node = (*p_tree)[e.nid];
auto split_type = p_tree->NodeSplitType(e.nid);
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto node_cats = dh::ToSpan(node_categories);
auto node_cats = e.split.split_cats.Bits();

row_partitioner->UpdatePosition(
nidx, split_node.LeftChild(), split_node.RightChild(),
e.nid, split_node.LeftChild(), split_node.RightChild(),
[=] __device__(bst_uint ridx) {
// given a row index, returns the node id it belongs to
bst_float cut_value =
@@ -567,27 +565,12 @@ struct GPUHistMakerDevice {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large.";
std::vector<uint32_t> split_cats;
if (candidate.split.split_cats.Bits().empty()) {
if (common::InvalidCat(candidate.split.fvalue)) {
common::InvalidCategory();
}
auto cat = common::AsCat(candidate.split.fvalue);
split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0);
common::CatBitField cats_bits(split_cats);
cats_bits.Set(cat);
dh::CopyToD(split_cats, &node_categories);
} else {
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto max_cat = candidate.split.MaxCat();
split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0);
CHECK_LE(split_cats.size(), h_cats.size());
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());

node_categories.resize(candidate.split.split_cats.Bits().size());
dh::safe_cuda(cudaMemcpyAsync(
node_categories.data().get(), candidate.split.split_cats.Data(),
candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice));
}
CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto max_cat = candidate.split.MaxCat();
split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0);
CHECK_LE(split_cats.size(), h_cats.size());
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());

tree.ExpandCategorical(
candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
@@ -674,7 +657,7 @@ struct GPUHistMakerDevice {
// Update position is only run when child is valid, instead of right after apply
// split (as in approx tree method). Hence we have the finalise position call
// in GPU Hist.
this->UpdatePosition(candidate.nid, p_tree);
this->UpdatePosition(candidate, p_tree);
monitor.Stop("UpdatePosition");

monitor.Start("BuildHist");
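`UpdatePosition` now takes the whole `GPUExpandEntry`, so the split categories are read straight from the candidate's device bitset instead of a separately maintained `node_categories` vector, and `ApplySplit` always copies the host-side categories produced by the evaluator. The per-row decision this bitset feeds can be sketched as follows — a hypothetical helper, not the actual device lambda, and whether a set bit means "left" or "right" follows the library's own convention (a set bit is arbitrarily treated as "go left" here):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of per-row routing for a categorical split: the candidate's bitset
// (e.split.split_cats.Bits()) decides which child a row's category belongs
// to; categories unseen during training follow the default direction.
inline bool CategoryGoesLeft(std::uint32_t cat,
                             std::vector<std::uint32_t> const& split_bits,
                             bool default_left) {
  std::size_t word = cat / 32, bit = cat % 32;
  if (word >= split_bits.size()) {
    return default_left;  // category not covered by the bitset
  }
  return (split_bits[word] >> bit) & 1u;
}
```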