From 606be9e6639dc3b24a5af1e8a131673f90aabdca Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 24 May 2022 20:48:41 +0800
Subject: [PATCH] Handle missing values in one hot splits. (#7934)

---
 src/tree/hist/evaluate_splits.h | 72 ++++++++++++++++++++++++++++-----
 tests/python/test_updaters.py   | 32 +++++++++++++--
 tests/python/testing.py         | 15 ++++++-
 3 files changed, 105 insertions(+), 14 deletions(-)

diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index 894edfe325d3..fa6cc718b7f7 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -45,14 +45,72 @@ class HistEvaluator {
   // then - there are no missing values
   // else - there are missing values
   bool static SplitContainsMissingValues(const GradStats e, const NodeEntry &snode) {
-    if (e.GetGrad() == snode.stats.GetGrad() &&
-        e.GetHess() == snode.stats.GetHess()) {
+    if (e.GetGrad() == snode.stats.GetGrad() && e.GetHess() == snode.stats.GetHess()) {
       return false;
     } else {
       return true;
     }
   }
 
+  bool IsValid(GradStats const &left, GradStats const &right) const {
+    return left.GetHess() >= param_.min_child_weight && right.GetHess() >= param_.min_child_weight;
+  }
+
+  /**
+   * \brief Use learned direction with one-hot split. Other implementations (LGB, sklearn)
+   *        create a pseudo-category for missing value but here we just do a complete scan
+   *        to avoid making specialized histogram bin.
+   */
+  void EnumerateOneHot(common::HistogramCuts const &cut, const common::GHistRow &hist,
+                       bst_feature_t fidx, bst_node_t nidx,
+                       TreeEvaluator::SplitEvaluator<TrainParam> const &evaluator,
+                       SplitEntry *p_best) const {
+    const std::vector<uint32_t> &cut_ptr = cut.Ptrs();
+    const std::vector<bst_float> &cut_val = cut.Values();
+
+    bst_bin_t ibegin = static_cast<bst_bin_t>(cut_ptr[fidx]);
+    bst_bin_t iend = static_cast<bst_bin_t>(cut_ptr[fidx + 1]);
+    bst_bin_t n_bins = iend - ibegin;
+
+    GradStats left_sum;
+    GradStats right_sum;
+    // best split so far
+    SplitEntry best;
+
+    auto f_hist = hist.subspan(cut_ptr[fidx], n_bins);
+    auto feature_sum = GradStats{
+        std::accumulate(f_hist.data(), f_hist.data() + f_hist.size(), GradientPairPrecise{})};
+    GradStats missing;
+    auto const &parent = snode_[nidx];
+    missing.SetSubstract(parent.stats, feature_sum);
+
+    for (bst_bin_t i = ibegin; i != iend; i += 1) {
+      auto split_pt = cut_val[i];
+
+      // missing on left (treat missing as other categories)
+      right_sum = GradStats{hist[i]};
+      left_sum.SetSubstract(parent.stats, right_sum);
+      if (IsValid(left_sum, right_sum)) {
+        auto missing_left_chg = static_cast<float>(
+            evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
+            parent.root_gain);
+        best.Update(missing_left_chg, fidx, split_pt, true, true, left_sum, right_sum);
+      }
+
+      // missing on right (treat missing as chosen category)
+      left_sum.SetSubstract(left_sum, missing);
+      right_sum.Add(missing);
+      if (IsValid(left_sum, right_sum)) {
+        auto missing_right_chg = static_cast<float>(
+            evaluator.CalcSplitGain(param_, nidx, fidx, GradStats{left_sum}, GradStats{right_sum}) -
+            parent.root_gain);
+        best.Update(missing_right_chg, fidx, split_pt, false, true, left_sum, right_sum);
+      }
+    }
+
+    p_best->Update(best);
+  }
+
   // Enumerate/Scan the split values of specific feature
   // Returns the sum of gradients corresponding to the data points that contains
   // a non-missing value for the particular feature fid.
@@ -102,9 +160,7 @@ class HistEvaluator {
           break;
         }
         case kOneHot: {
-          // not-chosen categories go to left
-          right_sum = GradStats{hist[i]};
-          left_sum.SetSubstract(parent.stats, right_sum);
+          std::terminate();  // unreachable
           break;
         }
         case kPart: {
@@ -151,7 +207,7 @@ class HistEvaluator {
           break;
         }
         case kOneHot: {
-          split_pt = cut_val[i];
+          std::terminate();  // unreachable
           break;
         }
         case kPart: {
@@ -188,7 +244,6 @@ class HistEvaluator {
         // Normal, accumulated to left
         return left_sum;
       case kOneHot:
-        // Doesn't matter, not accumulating.
         return {};
       case kPart:
         // Accumulated to right due to chosen cats go to right.
@@ -242,8 +297,7 @@ class HistEvaluator {
       if (is_cat) {
         auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
         if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
-          EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
-          EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
+          EnumerateOneHot(cut, histogram, fidx, nidx, evaluator, best);
         } else {
           std::vector<size_t> sorted_idx(n_bins);
           std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py
index 251439cdfed6..b4cc1eb310bb 100644
--- a/tests/python/test_updaters.py
+++ b/tests/python/test_updaters.py
@@ -214,6 +214,9 @@ def test_max_cat(self, tree_method) -> None:
         self.run_max_cat(tree_method)
 
     def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
+        USE_ONEHOT = np.iinfo(np.int32).max
+        USE_PART = 1
+
         onehot, label = tm.make_categorical(rows, cols, cats, True)
         cat, _ = tm.make_categorical(rows, cols, cats, False)
 
@@ -221,10 +224,9 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
         by_builtin_results = {}
 
         predictor = "gpu_predictor" if tree_method == "gpu_hist" else None
+        parameters = {"tree_method": tree_method, "predictor": predictor}
         # Use one-hot exclusively
-        parameters = {
-            "tree_method": tree_method, "predictor": predictor, "max_cat_to_onehot": 9999
-        }
+        parameters["max_cat_to_onehot"] = USE_ONEHOT
 
         m = xgb.DMatrix(onehot, label, enable_categorical=False)
         xgb.train(
@@ -257,7 +259,8 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
         assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
 
         by_grouping: xgb.callback.TrainingCallback.EvalsLog = {}
-        parameters["max_cat_to_onehot"] = 1
+        # switch to partition-based splits
+        parameters["max_cat_to_onehot"] = USE_PART
         parameters["reg_lambda"] = 0
         m = xgb.DMatrix(cat, label, enable_categorical=True)
         xgb.train(
@@ -284,6 +287,27 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method):
         )
         assert tm.non_increasing(by_grouping["Train"]["rmse"]), by_grouping
 
+        # test with missing values
+        cat, label = tm.make_categorical(
+            n_samples=256, n_features=4, n_categories=8, onehot=False, sparsity=0.5
+        )
+        Xy = xgb.DMatrix(cat, label, enable_categorical=True)
+        evals_result = {}
+        # Test with onehot splits
+        parameters["max_cat_to_onehot"] = USE_ONEHOT
+        booster = xgb.train(
+            parameters,
+            Xy,
+            num_boost_round=16,
+            evals=[(Xy, "Train")],
+            evals_result=evals_result
+        )
+        assert tm.non_increasing(evals_result["Train"]["rmse"])
+        y_predt = booster.predict(Xy)
+
+        rmse = tm.root_mean_square(label, y_predt)
+        np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1])
+
     @given(strategies.integers(10, 400), strategies.integers(3, 8),
            strategies.integers(1, 2), strategies.integers(4, 7))
     @settings(deadline=None, print_blob=True)
diff --git a/tests/python/testing.py b/tests/python/testing.py
index 1cfaca876e87..cf723e12b0b6 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -302,7 +302,7 @@ def get_mq2008(dpath):
 
 @memory.cache
 def make_categorical(
-    n_samples: int, n_features: int, n_categories: int, onehot: bool
+    n_samples: int, n_features: int, n_categories: int, onehot: bool, sparsity=0.0,
 ):
     import pandas as pd
 
@@ -325,6 +325,13 @@ def make_categorical(
     for col in df.columns:
         df[col] = df[col].cat.set_categories(categories)
 
+    if sparsity > 0.0:
+        for i in range(n_features):
+            index = rng.randint(low=0, high=n_samples-1, size=int(n_samples * sparsity))
+            df.iloc[index, i] = np.NaN
+            assert df.iloc[:, i].isnull().values.any()
+            assert n_categories == np.unique(df.dtypes[i].categories).size
+
     if onehot:
         return pd.get_dummies(df), label
     return df, label
@@ -538,6 +545,12 @@ def eval_error_metric_skl(y_true: np.ndarray, y_score: np.ndarray) -> float:
     return np.sum(r)
 
 
+def root_mean_square(y_true: np.ndarray, y_score: np.ndarray) -> float:
+    err = y_score - y_true
+    rmse = np.sqrt(np.dot(err, err) / y_score.size)
+    return rmse
+
+
 def softmax(x):
     e = np.exp(x)
     return e / np.sum(e)
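
The scan added by `EnumerateOneHot` can be summarised as: for every category bin, evaluate the split twice, once with the missing values grouped with the not-chosen categories (default direction left) and once with them grouped with the chosen category (default direction right), and keep the best candidate overall. Below is a minimal NumPy sketch of that logic. The gain formula (`G^2 / (H + reg_lambda)` per child, no `min_split_loss` or `max_delta_step`), the function name, and the data layout are simplified illustrations, not XGBoost's actual evaluator.

```python
import numpy as np


def one_hot_best_split(hist, parent_sum, reg_lambda=1.0, min_child_weight=1.0):
    """hist: (n_categories, 2) array of per-category (grad, hess) sums, missing rows excluded.
    parent_sum: (grad, hess) over all rows in the node, missing rows included."""
    def gain(g, h):
        return g * g / (h + reg_lambda)

    hist = np.asarray(hist, dtype=float)
    parent = np.asarray(parent_sum, dtype=float)
    missing = parent - hist.sum(axis=0)  # stats of rows whose value is missing
    root_gain = gain(*parent)

    best = (-np.inf, None, None)  # (gain change, category, missing goes left?)
    for cat, (g, h) in enumerate(hist):
        # missing on left: only the chosen category goes right
        right = np.array([g, h])
        left = parent - right
        if left[1] >= min_child_weight and right[1] >= min_child_weight:
            chg = gain(*left) + gain(*right) - root_gain
            if chg > best[0]:
                best = (chg, cat, True)
        # missing on right: missing values join the chosen category
        left = left - missing
        right = right + missing
        if left[1] >= min_child_weight and right[1] >= min_child_weight:
            chg = gain(*left) + gain(*right) - root_gain
            if chg > best[0]:
                best = (chg, cat, False)
    return best


# Example: three categories plus some rows with missing values.
hist = np.array([[1.0, 2.0], [-3.0, 4.0], [0.5, 1.0]])  # per-category (grad, hess)
parent = hist.sum(axis=0) + np.array([2.0, 3.0])         # add (grad, hess) of missing rows
print(one_hot_best_split(hist, parent))
```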
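The new test in `test_updaters.py` also doubles as a usage recipe. Here is a self-contained sketch of the same workflow using only the public Python API; the data generation stands in for the `tm.make_categorical` and `tm.root_mean_square` helpers, and the sizes, seed, and parameter values are illustrative.

```python
import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.RandomState(1994)
n_samples, n_features, n_categories = 256, 4, 8

# Categorical features with roughly half of the values missing in each column.
df = pd.DataFrame(
    rng.randint(0, n_categories, size=(n_samples, n_features)),
    columns=[f"f{i}" for i in range(n_features)],
).astype("category")
label = rng.randn(n_samples)
for i in range(n_features):
    idx = rng.randint(0, n_samples - 1, size=n_samples // 2)
    df.iloc[idx, i] = np.nan

Xy = xgb.DMatrix(df, label, enable_categorical=True)
params = {
    "tree_method": "hist",
    # force one-hot based categorical splits for every feature
    "max_cat_to_onehot": np.iinfo(np.int32).max,
}
evals_result = {}
booster = xgb.train(
    params, Xy, num_boost_round=16, evals=[(Xy, "Train")], evals_result=evals_result
)

# Training RMSE recomputed from predictions should match the last logged value
# (rtol allows for the rounding applied to logged evaluation results).
err = booster.predict(Xy) - label
rmse = np.sqrt(np.dot(err, err) / label.size)
np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1], rtol=1e-5)
```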