CPU evaluation for cat data. (#7393)
* Implementation for one-hot based splits.
* Implementation for partition based splits. (LightGBM)
trivialfis committed Nov 6, 2021
1 parent 6ede124 commit d7d1b6e
Showing 15 changed files with 540 additions and 166 deletions.
14 changes: 12 additions & 2 deletions src/common/categorical.h
@@ -5,11 +5,12 @@
#ifndef XGBOOST_COMMON_CATEGORICAL_H_
#define XGBOOST_COMMON_CATEGORICAL_H_

#include "bitfield.h"
#include "xgboost/base.h"
#include "xgboost/data.h"
#include "xgboost/span.h"
#include "xgboost/parameter.h"
#include "bitfield.h"
#include "xgboost/span.h"
#include "xgboost/task.h"

namespace xgboost {
namespace common {
@@ -47,6 +48,15 @@ inline void InvalidCategory() {
"should be non-negative.";
}

/*!
 * \brief Whether we should use one-hot encoding for categorical data.
*/
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot ||
(task.task != ObjInfo::kRegression && task.task != ObjInfo::kBinary);
return use_one_hot;
}

struct IsCatOp {
XGBOOST_DEVICE bool operator()(FeatureType ft) {
return ft == FeatureType::kCategorical;
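The UseOneHot helper above is the policy switch for this commit: one-hot splits are used when a feature has fewer than max_cat_to_onehot categories, or when the task is neither regression nor binary classification (the partition-based search is only wired up for those objectives). A minimal sketch of how an evaluator branches on it; EnumerateOneHot and EnumeratePartition are hypothetical stand-ins for the routines in src/tree/hist/evaluate_splits.h, not the actual API, and n_cats, param, task, hist, fidx, and best are assumed to be in scope:

// Sketch only: gating between the two categorical split strategies.
// EnumerateOneHot / EnumeratePartition are illustrative names.
if (common::UseOneHot(n_cats, param.max_cat_to_onehot, task)) {
  EnumerateOneHot(hist, fidx, &best);     // one category vs. all the others
} else {
  EnumeratePartition(hist, fidx, &best);  // LightGBM-style subset search
}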
253 changes: 185 additions & 68 deletions src/tree/hist/evaluate_splits.h

Large diffs are not rendered by default.
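Since the diff for this file is collapsed, here is a short, self-contained sketch of the LightGBM-style partition search it implements: sort the per-category gradient statistics by their grad/hess ratio, then sweep the sorted order, moving one category at a time into the left child and scoring the induced partition. The gain formula below is simplified (no 1/2 factor, no gamma, no missing-value or constraint handling), and all names are illustrative rather than the evaluator's real interface:

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

struct GradPair { double grad{0.0}, hess{0.0}; };

// Best gain over partitions of the form "first k sorted categories go left".
double BestPartitionGain(std::vector<GradPair> const &hist, double lambda) {
  std::vector<std::size_t> idx(hist.size());
  std::iota(idx.begin(), idx.end(), 0);
  // Sort category buckets by gradient/hessian ratio, as LightGBM does.
  // (Empty buckets would need to be skipped; omitted for brevity.)
  std::sort(idx.begin(), idx.end(), [&](std::size_t l, std::size_t r) {
    return hist[l].grad / hist[l].hess < hist[r].grad / hist[r].hess;
  });
  GradPair total;
  for (auto const &g : hist) { total.grad += g.grad; total.hess += g.hess; }
  auto gain = [&](GradPair const &s) { return s.grad * s.grad / (s.hess + lambda); };
  double parent = gain(total), best = 0.0;
  GradPair left;
  for (std::size_t i = 0; i + 1 < idx.size(); ++i) {
    left.grad += hist[idx[i]].grad;
    left.hess += hist[idx[i]].hess;
    GradPair right{total.grad - left.grad, total.hess - left.hess};
    best = std::max(best, gain(left) + gain(right) - parent);
  }
  return best;
}

Sorting by grad/hess is what collapses the exponential subset search to a single O(k log k) pass: for a second-order objective the optimal category partition is always a prefix of this ordering, which is the classic result LightGBM builds on.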

44 changes: 40 additions & 4 deletions src/tree/param.h
@@ -1,12 +1,13 @@
/*!
- * Copyright 2014-2019 by Contributors
+ * Copyright 2014-2021 by Contributors
* \file param.h
* \brief training parameters, statistics used to support tree construction.
* \author Tianqi Chen
*/
#ifndef XGBOOST_TREE_PARAM_H_
#define XGBOOST_TREE_PARAM_H_

#include <algorithm>
#include <cmath>
#include <cstring>
#include <limits>
@@ -15,6 +16,7 @@

#include "xgboost/parameter.h"
#include "xgboost/data.h"
#include "../common/categorical.h"
#include "../common/math.h"

namespace xgboost {
@@ -36,6 +38,8 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
enum TreeGrowPolicy { kDepthWise = 0, kLossGuide = 1 };
int grow_policy;

uint32_t max_cat_to_onehot{1};

//----- the rest parameters are less important ----
// minimum amount of hessian(weight) allowed in a child
float min_child_weight;
@@ -119,6 +123,10 @@ struct TrainParam : public XGBoostParameter<TrainParam> {
"Tree growing policy. 0: favor splitting at nodes closest to the node, "
"i.e. grow depth-wise. 1: favor splitting at nodes with highest loss "
"change. (cf. LightGBM)");
DMLC_DECLARE_FIELD(max_cat_to_onehot)
.set_default(4)
.set_lower_bound(1)
.describe("Maximum number of categories to use one-hot encoding based split.");
DMLC_DECLARE_FIELD(min_child_weight)
.set_lower_bound(0.0f)
.set_default(1.0f)
@@ -384,6 +392,8 @@ struct SplitEntryContainer {
/*! \brief split index */
bst_feature_t sindex{0};
bst_float split_value{0.0f};
std::vector<uint32_t> cat_bits;
bool is_cat{false};

GradientT left_sum;
GradientT right_sum;
@@ -433,6 +443,8 @@ struct SplitEntryContainer {
this->loss_chg = e.loss_chg;
this->sindex = e.sindex;
this->split_value = e.split_value;
this->is_cat = e.is_cat;
this->cat_bits = e.cat_bits;
this->left_sum = e.left_sum;
this->right_sum = e.right_sum;
return true;
@@ -449,16 +461,40 @@ struct SplitEntryContainer {
* \return whether the proposed split is better and can replace current split
*/
bool Update(bst_float new_loss_chg, unsigned split_index,
-            bst_float new_split_value, bool default_left,
-            const GradientT &left_sum,
-            const GradientT &right_sum) {
+            bst_float new_split_value, bool default_left, bool is_cat,
+            const GradientT &left_sum, const GradientT &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) {
this->loss_chg = new_loss_chg;
if (default_left) {
split_index |= (1U << 31);
}
this->sindex = split_index;
this->split_value = new_split_value;
this->is_cat = is_cat;
this->left_sum = left_sum;
this->right_sum = right_sum;
return true;
} else {
return false;
}
}

/*!
* \brief Update with partition based categorical split.
*
* \return Whether the proposed split is better and can replace current split.
*/
bool Update(float new_loss_chg, bst_feature_t split_index, common::KCatBitField cats,
bool default_left, GradientT const &left_sum, GradientT const &right_sum) {
if (this->NeedReplace(new_loss_chg, split_index)) {
this->loss_chg = new_loss_chg;
if (default_left) {
split_index |= (1U << 31);
}
this->sindex = split_index;
cat_bits.resize(cats.Bits().size());
std::copy(cats.Bits().begin(), cats.Bits().end(), cat_bits.begin());
this->is_cat = true;
this->left_sum = left_sum;
this->right_sum = right_sum;
return true;
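The partition-based overload above stores the chosen category subset by copying the words of the KCatBitField into cat_bits. A sketch of how those bits can be decoded later when routing a row; it assumes the bitfield API in src/common/bitfield.h (Check(i) tests bit i), and which child the selected subset maps to is the evaluator's convention, so "left" here is purely illustrative:

// Sketch: does this category belong to the split's selected subset?
// The left/right convention is an assumption for illustration.
inline bool CatGoesLeft(SplitEntryContainer<GradStats> const &split, bst_cat_t cat) {
  auto cats = common::KCatBitField{
      common::Span<uint32_t const>{split.cat_bits.data(), split.cat_bits.size()}};
  return cats.Check(cat);
}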
8 changes: 7 additions & 1 deletion src/tree/split_evaluator.h
@@ -92,7 +92,7 @@ class TreeEvaluator {

XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT &param,
tree::GradStats const& stats) const {
-    float w = xgboost::tree::CalcWeight(param, stats);
+    float w = ::xgboost::tree::CalcWeight(param, stats);
if (!has_constraint) {
return w;
}
@@ -107,6 +107,12 @@
return w;
}
}

template <typename GradientSumT>
XGBOOST_DEVICE double CalcWeightCat(ParamT const& param, GradientSumT const& stats) const {
return ::xgboost::tree::CalcWeight(param, stats);
}

XGBOOST_DEVICE float
CalcGainGivenWeight(ParamT const &p, tree::GradStats const& stats, float w) const {
if (stats.GetHess() <= 0) {
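CalcWeightCat returns the raw Newton-step leaf weight without the monotone-constraint clamping that CalcWeight applies, which suggests it exists to score individual category buckets. A sketch of the presumed use, ordering categories by their unconstrained weight before the partition sweep (evaluator, param, idx, and hist are assumed to be in scope):

// Sketch: sort category bins by their unconstrained leaf weight,
// i.e. roughly -grad / (hess + lambda), before scanning partitions.
std::sort(idx.begin(), idx.end(), [&](size_t l, size_t r) {
  return evaluator.CalcWeightCat(param, hist[l]) <
         evaluator.CalcWeightCat(param, hist[r]);
});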
14 changes: 7 additions & 7 deletions src/tree/updater_colmaker.cc
@@ -336,10 +336,10 @@ class ColMaker: public TreeUpdater {
bst_float proposed_split = (fvalue + e.last_fvalue) * 0.5f;
if ( proposed_split == fvalue ) {
e.best.Update(loss_chg, fid, e.last_fvalue,
-              d_step == -1, c, e.stats);
+              d_step == -1, false, c, e.stats);
} else {
e.best.Update(loss_chg, fid, proposed_split,
-              d_step == -1, c, e.stats);
+              d_step == -1, false, c, e.stats);
}
} else {
loss_chg = static_cast<bst_float>(
@@ -348,10 +348,10 @@ class ColMaker: public TreeUpdater {
bst_float proposed_split = (fvalue + e.last_fvalue) * 0.5f;
if ( proposed_split == fvalue ) {
e.best.Update(loss_chg, fid, e.last_fvalue,
-              d_step == -1, e.stats, c);
+              d_step == -1, false, e.stats, c);
} else {
e.best.Update(loss_chg, fid, proposed_split,
-              d_step == -1, e.stats, c);
+              d_step == -1, false, e.stats, c);
}
}
}
@@ -430,14 +430,14 @@ class ColMaker: public TreeUpdater {
loss_chg = static_cast<bst_float>(
evaluator.CalcSplitGain(param_, nid, fid, c, e.stats) -
snode_[nid].root_gain);
-e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1, c,
-              e.stats);
+e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1,
+              false, c, e.stats);
} else {
loss_chg = static_cast<bst_float>(
evaluator.CalcSplitGain(param_, nid, fid, e.stats, c) -
snode_[nid].root_gain);
e.best.Update(loss_chg, fid, e.last_fvalue + delta, d_step == -1,
-              e.stats, c);
+              false, e.stats, c);
}
}
}
6 changes: 4 additions & 2 deletions src/tree/updater_histmaker.cc
@@ -173,7 +173,8 @@ class HistMaker: public BaseMaker {
if (c.sum_hess >= param_.min_child_weight) {
double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) +
CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain;
-if (best->Update(static_cast<bst_float>(loss_chg), fid, hist.cut[i], false, s, c)) {
+if (best->Update(static_cast<bst_float>(loss_chg), fid, hist.cut[i],
+                 false, false, s, c)) {
*left_sum = s;
}
}
@@ -187,7 +188,8 @@ class HistMaker: public BaseMaker {
if (c.sum_hess >= param_.min_child_weight) {
double loss_chg = CalcGain(param_, s.GetGrad(), s.GetHess()) +
CalcGain(param_, c.GetGrad(), c.GetHess()) - root_gain;
-if (best->Update(static_cast<bst_float>(loss_chg), fid, hist.cut[i-1], true, c, s)) {
+if (best->Update(static_cast<bst_float>(loss_chg), fid,
+                 hist.cut[i - 1], true, false, c, s)) {
*left_sum = c;
}
}
17 changes: 10 additions & 7 deletions src/tree/updater_quantile_hist.cc
@@ -168,9 +168,11 @@ void QuantileHistMaker::Builder<GradientSumT>::InitRoot(

std::vector<CPUExpandEntry> entries{node};
builder_monitor_.Start("EvaluateSplits");
+auto ft = p_fmat->Info().feature_types.ConstHostSpan();
for (auto const &gmat : p_fmat->GetBatches<GHistIndexMatrix>(
         BatchParam{GenericParameter::kCpuId, param_.max_bin})) {
-  evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, *p_tree, &entries);
+  evaluator_->EvaluateSplits(histogram_builder_->Histogram(), gmat.cut, ft,
+                             *p_tree, &entries);
break;
}
builder_monitor_.Stop("EvaluateSplits");
@@ -272,8 +274,9 @@ void QuantileHistMaker::Builder<GradientSumT>::ExpandTree(
}

builder_monitor_.Start("EvaluateSplits");
-evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(), gmat.cut,
-                           *p_tree, &nodes_to_evaluate);
+auto ft = p_fmat->Info().feature_types.ConstHostSpan();
+evaluator_->EvaluateSplits(this->histogram_builder_->Histogram(),
+                           gmat.cut, ft, *p_tree, &nodes_to_evaluate);
builder_monitor_.Stop("EvaluateSplits");

for (size_t i = 0; i < nodes_for_apply_split.size(); ++i) {
@@ -529,11 +532,11 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(
// store a pointer to the tree
p_last_tree_ = &tree;
if (data_layout_ == DataLayout::kDenseDataOneBased) {
-  evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{param_, info, this->nthread_,
-                                                                   column_sampler_, true});
+  evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{
+      param_, info, this->nthread_, column_sampler_, task_, true});
} else {
-  evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{param_, info, this->nthread_,
-                                                                   column_sampler_, false});
+  evaluator_.reset(new HistEvaluator<GradientSumT, CPUExpandEntry>{
+      param_, info, this->nthread_, column_sampler_, task_, false});
}

if (data_layout_ == DataLayout::kDenseDataZeroBased
44 changes: 44 additions & 0 deletions tests/cpp/categorical_helpers.h
@@ -0,0 +1,44 @@
/*!
* Copyright 2021 by XGBoost Contributors
*
* \brief Utilities for testing categorical data support.
*/
#include <numeric>
#include <vector>

#include "xgboost/span.h"
#include "helpers.h"
#include "../../src/common/categorical.h"

namespace xgboost {
inline std::vector<float> OneHotEncodeFeature(std::vector<float> x,
size_t num_cat) {
std::vector<float> ret(x.size() * num_cat, 0);
size_t n_rows = x.size();
for (size_t r = 0; r < n_rows; ++r) {
bst_cat_t cat = common::AsCat(x[r]);
ret.at(num_cat * r + cat) = 1;
}
return ret;
}

template <typename GradientSumT>
void ValidateCategoricalHistogram(size_t n_categories,
common::Span<GradientSumT> onehot,
common::Span<GradientSumT> cat) {
auto cat_sum = std::accumulate(cat.cbegin(), cat.cend(), GradientPairPrecise{});
for (size_t c = 0; c < n_categories; ++c) {
auto zero = onehot[c * 2];
auto one = onehot[c * 2 + 1];

auto chosen = cat[c];
auto not_chosen = cat_sum - chosen;

ASSERT_LE(RelError(zero.GetGrad(), not_chosen.GetGrad()), kRtEps);
ASSERT_LE(RelError(zero.GetHess(), not_chosen.GetHess()), kRtEps);

ASSERT_LE(RelError(one.GetGrad(), chosen.GetGrad()), kRtEps);
ASSERT_LE(RelError(one.GetHess(), chosen.GetHess()), kRtEps);
}
}
} // namespace xgboost
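For reference, a small worked example of OneHotEncodeFeature (values chosen here for illustration): encoding x = {0, 2, 1} with num_cat = 3 yields a row-major 3x3 one-hot matrix.

// Each input value expands to a num_cat-wide row with exactly one 1.
auto encoded = OneHotEncodeFeature({0.0f, 2.0f, 1.0f}, 3);
// encoded == {1, 0, 0,   row 0: category 0
//             0, 0, 1,   row 1: category 2
//             0, 1, 0}   row 2: category 1

ValidateCategoricalHistogram then checks, category by category, that a histogram built on this expansion matches the native categorical histogram: the "one" bin must carry the chosen category's statistics and the "zero" bin the sum of all the others.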
37 changes: 28 additions & 9 deletions tests/cpp/common/test_quantile.cu
@@ -5,6 +5,13 @@
#include "../../../src/common/quantile.cuh"

namespace xgboost {
namespace {
struct IsSorted {
XGBOOST_DEVICE bool operator()(common::SketchEntry const& a, common::SketchEntry const& b) const {
return a.value < b.value;
}
};
}
namespace common {
TEST(GPUQuantile, Basic) {
constexpr size_t kRows = 1000, kCols = 100, kBins = 256;
@@ -52,9 +59,15 @@ void TestSketchUnique(float sparsity) {
ASSERT_EQ(sketch.Data().size(), h_columns_ptr.back());

sketch.Unique();
-ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch.Data().data(),
-                              sketch.Data().data() + sketch.Data().size(),
-                              detail::SketchUnique{}));
+
+std::vector<SketchEntry> h_data(sketch.Data().size());
+thrust::copy(dh::tcbegin(sketch.Data()), dh::tcend(sketch.Data()), h_data.begin());
+
+for (size_t i = 1; i < h_columns_ptr.size(); ++i) {
+  auto begin = h_columns_ptr[i - 1];
+  auto column = common::Span<SketchEntry>(h_data).subspan(begin, h_columns_ptr[i] - begin);
+  ASSERT_TRUE(std::is_sorted(column.begin(), column.end(), IsSorted{}));
+}
});
}

@@ -84,8 +97,7 @@ void TestQuantileElemRank(int32_t device, Span<SketchEntry const> in,
if (with_error) {
ASSERT_GE(in_column[idx].rmin + in_column[idx].rmin * kRtEps,
prev_rmin);
-ASSERT_GE(in_column[idx].rmax + in_column[idx].rmin * kRtEps,
-          prev_rmax);
+ASSERT_GE(in_column[idx].rmax + in_column[idx].rmin * kRtEps, prev_rmax);
ASSERT_GE(in_column[idx].rmax + in_column[idx].rmin * kRtEps,
rmin_next);
} else {
@@ -169,7 +181,7 @@ TEST(GPUQuantile, MergeEmpty) {

TEST(GPUQuantile, MergeBasic) {
constexpr size_t kRows = 1000, kCols = 100;
-RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const& info) {
+RunWithSeedsAndBins(kRows, [=](int32_t seed, size_t n_bins, MetaInfo const &info) {
HostDeviceVector<FeatureType> ft;
SketchContainer sketch_0(ft, n_bins, kCols, kRows, 0);
HostDeviceVector<float> storage_0;
@@ -265,9 +277,16 @@ void TestMergeDuplicated(int32_t n_bins, size_t cols, size_t rows, float frac) {
ASSERT_EQ(h_columns_ptr.back(), sketch_1.Data().size() + size_before_merge);

sketch_0.Unique();
-ASSERT_TRUE(thrust::is_sorted(thrust::device, sketch_0.Data().data(),
-                              sketch_0.Data().data() + sketch_0.Data().size(),
-                              detail::SketchUnique{}));
+columns_ptr = sketch_0.ColumnsPtr();
+dh::CopyDeviceSpanToVector(&h_columns_ptr, columns_ptr);
+
+std::vector<SketchEntry> h_data(sketch_0.Data().size());
+dh::CopyDeviceSpanToVector(&h_data, sketch_0.Data());
+for (size_t i = 1; i < h_columns_ptr.size(); ++i) {
+  auto begin = h_columns_ptr[i - 1];
+  auto column = Span<SketchEntry> {h_data}.subspan(begin, h_columns_ptr[i] - begin);
+  ASSERT_TRUE(std::is_sorted(column.begin(), column.end(), IsSorted{}));
+}
}

TEST(GPUQuantile, MergeDuplicated) {
4 changes: 3 additions & 1 deletion tests/cpp/common/test_quantile.h
@@ -48,7 +48,9 @@ template <typename Fn> void RunWithSeedsAndBins(size_t rows, Fn fn) {
std::vector<MetaInfo> infos(2);
auto& h_weights = infos.front().weights_.HostVector();
h_weights.resize(rows);
-std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); });
+
+SimpleRealUniformDistribution<float> weight_dist(0, 10);
+std::generate(h_weights.begin(), h_weights.end(), [&]() { return weight_dist(&lcg); });

for (auto seed : seeds) {
for (auto n_bin : bins) {
