From 9f68393676669f2df62d90f2f8ac92b6c453e232 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 03:36:17 +0800 Subject: [PATCH 01/19] Initial commit. --- src/tree/gpu_hist/evaluate_splits.cu | 83 +++++++++++++++++---------- src/tree/hist/evaluate_splits.h | 2 +- tests/python-gpu/test_gpu_updaters.py | 3 +- tests/python/test_updaters.py | 5 +- 4 files changed, 56 insertions(+), 37 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 908e7b01972c..3d6fa2dfa024 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -167,35 +167,34 @@ class EvaluateSplitAgent { } } } + __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, - bst_feature_t * __restrict__ sorted_idx, - std::size_t offset) { + bst_feature_t *__restrict__ sorted_idx, + std::size_t offset) { for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - auto rest = thread_active - ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) - : GradientPairPrecise(); + auto right_sum = + thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) + : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. - BlockScanT(temp_storage->scan).InclusiveSum(rest, rest, prefix_op); - GradientPairPrecise bin = parent_sum - rest - missing; + BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); + GradientPairPrecise left_sum = parent_sum - right_sum - missing; // Whether the gradient of missing values is put to the left side. bool missing_left = true; - float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx, - evaluator, missing_left) + float gain = thread_active ? LossChangeMissing(left_sum, missing, parent_sum, param, nidx, + fidx, evaluator, missing_left) : kNullGain; - // Find thread with best gain - auto best = - MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); // This reduce result is only valid in thread 0 // broadcast to the rest of the warp auto best_thread = __shfl_sync(0xffffffff, best.key, 0); // Best thread updates the split if (threadIdx.x == best_thread) { - GradientPairPrecise left = missing_left ? bin + missing : bin; + GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; GradientPairPrecise right = parent_sum - left; auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); // index of best threshold inside a feature. 
@@ -277,32 +276,54 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu return; } + // partition-based split auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); auto best_thresh = out_split.PopBestThresh(); auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); - if (out_split.dir != kLeftDir) { - // forward, missing on right - auto beg = dh::tcbegin(f_sorted_idx); - // Don't put all the categories into one side - auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); - boundary = std::max(boundary, static_cast(1ul)); - auto end = beg + boundary; - thrust::for_each(thrust::seq, beg, end, [&](auto c) { - auto cat = shared_inputs.feature_values[c - node_offset]; - assert(!out_split.split_cats.Check(cat) && "already set"); - out_split.SetCat(cat); - }); + bool forward = out_split.dir == kLeftDir; + auto cut_ptr = shared_inputs.feature_segments; + + auto n_bins = shared_inputs.FeatureBins(fidx); + bst_bin_t ibegin, iend; + bst_bin_t f_begin = cut_ptr[fidx]; + if (forward) { + ibegin = f_begin; + iend = ibegin + n_bins - 1; } else { - assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); - thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), - dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { - auto cat = shared_inputs.feature_values[c - node_offset]; - out_split.SetCat(cat); - }); + ibegin = static_cast(cut_ptr[fidx + 1]) - 1; + iend = ibegin - n_bins + 1; } + + bst_bin_t partition = forward ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + auto beg = dh::tcbegin(f_sorted_idx); + thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) { + auto cat = shared_inputs.feature_values[c - node_offset]; + out_split.SetCat(cat); + }); + + // if (out_split.dir != kLeftDir) { + // // backward, missing on right + // auto beg = dh::tcbegin(f_sorted_idx); + // // Don't put all the categories into one side + // auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); + // boundary = std::max(boundary, static_cast(1ul)); + // auto end = beg + boundary; + // thrust::for_each(thrust::seq, beg, end, [&](auto c) { + // auto cat = shared_inputs.feature_values[c - node_offset]; + // assert(!out_split.split_cats.Check(cat) && "already set"); + // out_split.SetCat(cat); + // }); + // } else { + // assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); + // thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), + // dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { + // auto cat = shared_inputs.feature_values[c - node_offset]; + // out_split.SetCat(cat); + // }); + // } } void GPUHistEvaluator::LaunchEvaluateSplits( diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 002728be365f..adb7e54decf9 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -193,7 +193,7 @@ class HistEvaluator { bst_bin_t partition = d_step == 1 ? 
(best_thresh - ibegin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, - [&](size_t c) { cat_bits.Set(c); }); + [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] } p_best->Update(best); diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index b2f8b1a27050..0b086f0e9282 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note +from hypothesis import given, strategies, assume, settings, note, reproduce_failure sys.path.append("tests/python") import testing as tm @@ -84,6 +84,7 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) + @reproduce_failure('6.47.1', b'AAAAAAA=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 93dfb45c6b45..881439292a06 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -257,10 +257,7 @@ def run(max_cat_to_onehot: int): np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1]) # Test with OHE split - run(self.USE_ONEHOT) - - if tree_method == "gpu_hist": # fixme: Test with GPU. - return + # run(self.USE_ONEHOT) # Test with partition-based split run(self.USE_PART) From c33d9fc270488bf7196ee5d612cd956925673380 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 17:48:41 +0800 Subject: [PATCH 02/19] Small cleanup to CPU. --- src/tree/hist/evaluate_splits.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index adb7e54decf9..644eadc30f81 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -144,7 +144,10 @@ class HistEvaluator { auto const &cut_ptr = cut.Ptrs(); auto const &parent = snode_[nidx]; - bst_bin_t n_bins_feature{static_cast(cut_ptr[fidx + 1] - cut_ptr[fidx])}; + + bst_bin_t f_begin = cut_ptr[fidx]; + bst_bin_t f_end = cut_ptr[fidx + 1]; + bst_bin_t n_bins_feature{f_end - f_begin}; auto n_bins = std::min(param_.max_cat_threshold, n_bins_feature); // statistics on both sides of split @@ -153,19 +156,18 @@ class HistEvaluator { // best split so far SplitEntry best; - auto f_hist = hist.subspan(cut_ptr[fidx], n_bins_feature); - bst_bin_t ibegin, iend; - bst_bin_t f_begin = cut_ptr[fidx]; + auto f_hist = hist.subspan(f_begin, n_bins_feature); + bst_bin_t it_begin, it_end; if (d_step > 0) { - ibegin = f_begin; - iend = ibegin + n_bins - 1; + it_begin = f_begin; + it_end = it_begin + n_bins - 1; } else { - ibegin = static_cast(cut_ptr[fidx + 1]) - 1; - iend = ibegin - n_bins + 1; + it_begin = f_end - 1; + it_end = it_begin - n_bins + 1; } bst_bin_t best_thresh{-1}; - for (bst_bin_t i = ibegin; i != iend; i += d_step) { + for (bst_bin_t i = it_begin; i != it_end; i += d_step) { auto j = i - f_begin; // index local to current feature if (d_step == 1) { right_sum.Add(f_hist[sorted_idx[j]].GetGrad(), f_hist[sorted_idx[j]].GetHess()); @@ -190,7 +192,7 @@ class HistEvaluator { auto n = common::CatBitField::ComputeStorageSize(n_bins_feature + 1); best.cat_bits = decltype(best.cat_bits)(n, 0); common::CatBitField cat_bits{best.cat_bits}; - bst_bin_t partition = 
d_step == 1 ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] From 8397bb0d52974985461239a05fd1f1b3c529384e Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 19:14:46 +0800 Subject: [PATCH 03/19] Specialize on cat update. --- src/tree/gpu_hist/evaluate_splits.cu | 73 +++++++++++---------------- src/tree/gpu_hist/evaluate_splits.cuh | 3 +- src/tree/updater_gpu_common.cuh | 39 ++++++++------ src/tree/updater_gpu_hist.cu | 12 +++-- 4 files changed, 62 insertions(+), 65 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 3d6fa2dfa024..077cd795d68e 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -43,9 +43,9 @@ class EvaluateSplitAgent { public: using ArgMaxT = cub::KeyValuePair; using BlockScanT = cub::BlockScan; - using MaxReduceT = - cub::WarpReduce; + using MaxReduceT = cub::WarpReduce; using SumReduceT = cub::WarpReduce; + struct TempStorage { typename BlockScanT::TempStorage scan; typename MaxReduceT::TempStorage max_reduce; @@ -159,11 +159,10 @@ class EvaluateSplitAgent { if (threadIdx.x == best_thread) { int32_t split_gidx = (scan_begin + threadIdx.x); float fvalue = feature_values[split_gidx]; - GradientPairPrecise left = - missing_left ? bin + missing : bin; + GradientPairPrecise left = missing_left ? bin + missing : bin; GradientPairPrecise right = parent_sum - left; - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, - true, param); + best_split->UpdateCat(gain, missing_left ? kLeftDir : kRightDir, + static_cast(fvalue), fidx, left, right, param); } } } @@ -171,7 +170,7 @@ class EvaluateSplitAgent { __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, bst_feature_t *__restrict__ sorted_idx, std::size_t offset) { - for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { + for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; auto right_sum = @@ -187,6 +186,18 @@ class EvaluateSplitAgent { fidx, evaluator, missing_left) : kNullGain; + auto const thresh = threadIdx.x + scan_begin; + auto const &forward = missing_left; + auto const &backward = !forward; + // prevent splitting on missing value alone, skip the last bin if it's forward scan + // and skip the first bin if it's backward + auto const first_bin = gidx_begin; + auto const last_bin = gidx_end - 1; + bool const should_skip = (forward && thresh == last_bin) || (backward && thresh == first_bin); + if (should_skip) { + gain = kNullGain; + } + // Find thread with best gain auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); // This reduce result is only valid in thread 0 @@ -196,10 +207,10 @@ class EvaluateSplitAgent { if (threadIdx.x == best_thread) { GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; GradientPairPrecise right = parent_sum - left; - auto best_thresh = - threadIdx.x + (scan_begin - gidx_begin); // index of best threshold inside a feature. - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, - right, true, param); + // index of best threshold inside a feature. 
+ auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); + auto dft_dir = missing_left ? kLeftDir : kRightDir; + best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); } } } @@ -272,7 +283,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu // Simple case for one hot split if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) { - out_split.split_cats.Set(common::AsCat(out_split.fvalue)); + out_split.split_cats.Set(common::AsCat(out_split.thresh)); return; } @@ -280,50 +291,26 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); - auto best_thresh = out_split.PopBestThresh(); + auto best_thresh = out_split.thresh; + if (best_thresh == -1) { + return; + } + auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); bool forward = out_split.dir == kLeftDir; auto cut_ptr = shared_inputs.feature_segments; auto n_bins = shared_inputs.FeatureBins(fidx); - bst_bin_t ibegin, iend; bst_bin_t f_begin = cut_ptr[fidx]; - if (forward) { - ibegin = f_begin; - iend = ibegin + n_bins - 1; - } else { - ibegin = static_cast(cut_ptr[fidx + 1]) - 1; - iend = ibegin - n_bins + 1; - } - bst_bin_t partition = forward ? (best_thresh - ibegin + 1) : (best_thresh - f_begin); + bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); + assert(partition > 0 && "Invalid partition."); thrust::for_each(thrust::seq, beg, beg + partition, [&](size_t c) { auto cat = shared_inputs.feature_values[c - node_offset]; out_split.SetCat(cat); }); - - // if (out_split.dir != kLeftDir) { - // // backward, missing on right - // auto beg = dh::tcbegin(f_sorted_idx); - // // Don't put all the categories into one side - // auto boundary = std::min(static_cast((best_thresh + 1)), (f_sorted_idx.size() - 1)); - // boundary = std::max(boundary, static_cast(1ul)); - // auto end = beg + boundary; - // thrust::for_each(thrust::seq, beg, end, [&](auto c) { - // auto cat = shared_inputs.feature_values[c - node_offset]; - // assert(!out_split.split_cats.Check(cat) && "already set"); - // out_split.SetCat(cat); - // }); - // } else { - // assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); - // thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), - // dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { - // auto cat = shared_inputs.feature_values[c - node_offset]; - // out_split.SetCat(cat); - // }); - // } } void GPUHistEvaluator::LaunchEvaluateSplits( diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 3e3e51b8e7c2..6b50de0ce3de 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -141,7 +141,8 @@ class GPUHistEvaluator { */ common::Span GetHostNodeCats(bst_node_t nidx) const { copy_stream_.View().Sync(); - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); + auto cats_out = common::Span{h_split_cats_}.subspan( + nidx * node_categorical_storage_size_, node_categorical_storage_size_); return cats_out; } /** diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 
c7c81e964848..b9a31dd32477 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -57,6 +57,9 @@ struct DeviceSplitCandidate { DefaultDirection dir {kLeftDir}; int findex {-1}; float fvalue {0}; + // categorical split, either it's the split category for OHE or the threshold for partition-based + // split. + bst_cat_t thresh{-1}; common::CatBitField split_cats; bool is_cat { false }; @@ -75,22 +78,6 @@ struct DeviceSplitCandidate { *this = other; } } - /** - * \brief The largest encoded category in the split bitset - */ - bst_cat_t MaxCat() const { - // Reuse the fvalue for categorical values. - return static_cast(fvalue); - } - /** - * \brief Return the best threshold for cat split, reset the value after return. - */ - XGBOOST_DEVICE size_t PopBestThresh() { - // fvalue is also being used for storing the threshold for categorical split - auto best_thresh = static_cast(this->fvalue); - this->fvalue = 0; - return best_thresh; - } template XGBOOST_DEVICE void SetCat(T c) { @@ -116,6 +103,26 @@ struct DeviceSplitCandidate { findex = findex_in; } } + + /** + * \brief Update for partition-based splits. + */ + XGBOOST_DEVICE void UpdateCat(float loss_chg_in, DefaultDirection dir_in, bst_cat_t thresh_in, + bst_feature_t findex_in, GradientPairPrecise left_sum_in, + GradientPairPrecise right_sum_in, GPUTrainingParam const& param) { + if (loss_chg_in > loss_chg && left_sum_in.GetHess() >= param.min_child_weight && + right_sum_in.GetHess() >= param.min_child_weight) { + loss_chg = loss_chg_in; + dir = dir_in; + fvalue = std::numeric_limits::quiet_NaN(); + thresh = thresh_in; + is_cat = true; + left_sum = left_sum_in; + right_sum = right_sum_in; + findex = findex_in; + } + } + XGBOOST_DEVICE bool IsValid() const { return loss_chg > 0.0f; } friend std::ostream& operator<<(std::ostream& os, DeviceSplitCandidate const& c) { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 3f3137c58adc..bafe800cd98b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -601,13 +601,14 @@ struct GPUHistMakerDevice { auto is_cat = candidate.split.is_cat; if (is_cat) { - CHECK_LT(candidate.split.fvalue, std::numeric_limits::max()) - << "Categorical feature value too large."; - std::vector split_cats; + // should be set to nan in evaluation split. 
+ CHECK(common::CheckNAN(candidate.split.fvalue)); + std::vector split_cats; + CHECK_GT(candidate.split.split_cats.Bits().size(), 0); auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); - auto max_cat = candidate.split.MaxCat(); - split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0); + auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex); + split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature + 1), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); @@ -616,6 +617,7 @@ struct GPUHistMakerDevice { base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess()); } else { + CHECK(!common::CheckNAN(candidate.split.fvalue)); tree.ExpandNode(candidate.nid, candidate.split.findex, candidate.split.fvalue, candidate.split.dir == kLeftDir, base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), From dcfdbcafe8a0f4204b033b6fa2523a602afeb16e Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 30 Aug 2022 23:28:35 +0800 Subject: [PATCH 04/19] Dense, tie breaking. --- src/tree/gpu_hist/evaluate_splits.cu | 16 ++- src/tree/hist/evaluate_splits.h | 2 + .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 122 +++++++++++++++++- tests/cpp/tree/test_evaluate_splits.h | 16 ++- tests/python-gpu/test_gpu_updaters.py | 2 +- tests/python/test_updaters.py | 15 ++- 6 files changed, 158 insertions(+), 15 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 077cd795d68e..aaab931ff3dc 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -27,7 +27,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum); float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan); - missing_left_out = missing_left_gain > missing_right_gain; + missing_left_out = missing_left_gain >= missing_right_gain; return missing_left_out?missing_left_gain:missing_right_gain; } @@ -176,6 +176,7 @@ class EvaluateSplitAgent { auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) : GradientPairPrecise(); + // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); GradientPairPrecise left_sum = parent_sum - right_sum - missing; @@ -185,6 +186,7 @@ class EvaluateSplitAgent { float gain = thread_active ? LossChangeMissing(left_sum, missing, parent_sum, param, nidx, fidx, evaluator, missing_left) : kNullGain; + // printf("l: %f, r: %f, gain: %f\n", left_sum.GetHess(), right_sum.GetHess(), gain); auto const thresh = threadIdx.x + scan_begin; auto const &forward = missing_left; @@ -210,6 +212,7 @@ class EvaluateSplitAgent { // index of best threshold inside a feature. auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); auto dft_dir = missing_left ? 
kLeftDir : kRightDir; + // printf("best_thresh: %d\n", static_cast(best_thresh)); best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); } } @@ -291,18 +294,17 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, shared_inputs.feature_values.size()); size_t node_offset = input_idx * shared_inputs.feature_values.size(); - auto best_thresh = out_split.thresh; + auto const best_thresh = out_split.thresh; if (best_thresh == -1) { + printf("Invalid split\n"); return; } - auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); + // bool forward = true; bool forward = out_split.dir == kLeftDir; - auto cut_ptr = shared_inputs.feature_segments; - - auto n_bins = shared_inputs.FeatureBins(fidx); - bst_bin_t f_begin = cut_ptr[fidx]; + // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), + // static_cast(forward)); bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 644eadc30f81..b624152eecfd 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -196,6 +196,8 @@ class HistEvaluator { CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] + // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), + // static_cast(d_step == 1)); } p_best->Update(best); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 28b69122deae..2837036df32d 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -19,6 +19,125 @@ auto ZeroParam() { } } // anonymous namespace +TEST(GpuHist, PartitionBasic) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types; + auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); + cuts.SetCategorical(true, max_cat); + d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; 
+ auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-7.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; + // EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // All -1.0, gain from splitting should be 0.0 + // GradientPairPrecise parent_sum(-3.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // With 3.0/3.0 missing values + // All categories go left + // missing values go right + // { + // GradientPairPrecise parent_sum(0.0, 6.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("11100000000000000000000000000000")); + // EXPECT_EQ(result.dir, kRightDir); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-5.0, 3.0); + // thrust::device_vector feature_histogram = + // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; + // EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } + // { + // // -1.0s go left + // // -3.0s go right + // GradientPairPrecise parent_sum(-5.0, 3.0); + // thrust::device_vector feature_histogram = + // 
std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + // EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), + // dh::ToSpan(feature_histogram)}; + // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + // EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); + // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + // } +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); @@ -263,8 +382,7 @@ TEST(GpuHist, EvaluateSplits) { TEST_F(TestPartitionBasedSplit, GpuHist) { dh::device_vector ft{std::vector{FeatureType::kCategorical}}; - GPUHistEvaluator evaluator{param_, - static_cast(info_.num_col_), 0}; + GPUHistEvaluator evaluator{param_, static_cast(info_.num_col_), 0}; cuts_.cut_ptrs_.SetDevice(0); cuts_.cut_values_.SetDevice(0); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 7925e40cdb26..7db556659e2b 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -43,7 +43,7 @@ class TestPartitionBasedSplit : public ::testing::Test { auto &h_vals = cuts_.cut_values_.HostVector(); h_vals.resize(n_bins_); std::iota(h_vals.begin(), h_vals.end(), 0.0); - + cuts_.min_vals_.Resize(1); hist_.Init(cuts_.TotalBins()); @@ -97,5 +97,19 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; + +class TestPartitionBasedEvaluation : public testing::Test { + common::HistogramCuts cuts; + + public: + void SetUp() override { + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + + auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); + cuts.SetCategorical(true, max_cat); + } +}; } // namespace tree } // namespace xgboost diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 0b086f0e9282..afb3dbe5cf85 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -84,7 +84,7 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AAAAAAA=') + @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 881439292a06..4b864e447f8e 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -5,7 +5,7 @@ import pytest import xgboost as xgb import numpy as np -from hypothesis import given, strategies, settings, note +from hypothesis import given, strategies, settings, note, reproduce_failure exact_parameter_strategy = strategies.fixed_dictionaries({ 'nthread': strategies.integers(1, 4), @@ -234,7 +234,7 @@ def run_categorical_missing( ) -> None: parameters: Dict[str, Any] = {"tree_method": tree_method} cat, label = tm.make_categorical( - n_samples=256, n_features=4, n_categories=8, onehot=False, sparsity=0.5 + n_samples=rows, n_features=cols, 
n_categories=cats, onehot=False, sparsity=0.5 ) Xy = xgb.DMatrix(cat, label, enable_categorical=True) @@ -246,10 +246,16 @@ def run(max_cat_to_onehot: int): booster = xgb.train( parameters, Xy, - num_boost_round=16, + num_boost_round=8, evals=[(Xy, "Train")], evals_result=evals_result ) + import json + booster.save_model(f"{tree_method}.json") + with open(f"{tree_method}.json", "r") as fd: + model = json.load(fd) + with open(f"{tree_method}.json", "w") as fd: + json.dump(model, fd, indent=2) assert tm.non_increasing(evals_result["Train"]["rmse"]) y_predt = booster.predict(Xy) @@ -394,6 +400,7 @@ def test_categorical_ames_housing( ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) + @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): - self.run_categorical_missing(rows, cols, cats, "approx") + # self.run_categorical_missing(rows, cols, cats, "approx") self.run_categorical_missing(rows, cols, cats, "hist") From 0e4489a5b0e8367dae8bb706fe27fbc66cd5c597 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:44:54 +0800 Subject: [PATCH 05/19] Scan. --- src/tree/gpu_hist/evaluate_splits.cu | 75 +++++++++++++++------------- src/tree/hist/evaluate_splits.h | 2 - 2 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index aaab931ff3dc..58e5e41d140e 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -27,7 +27,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum); float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan); - missing_left_out = missing_left_gain >= missing_right_gain; + missing_left_out = missing_left_gain > missing_right_gain; return missing_left_out?missing_left_gain:missing_right_gain; } @@ -168,10 +168,10 @@ class EvaluateSplitAgent { } __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, - bst_feature_t *__restrict__ sorted_idx, + common::Span sorted_idx, std::size_t offset) { - for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) { - bool thread_active = (scan_begin + threadIdx.x) < gidx_end; + for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end - 1; scan_begin += kBlockSize) { + bool thread_active = (scan_begin + threadIdx.x) < (gidx_end - 1); auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) @@ -179,26 +179,39 @@ class EvaluateSplitAgent { // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); - GradientPairPrecise left_sum = parent_sum - right_sum - missing; + GradientPairPrecise left_sum = parent_sum - right_sum; + auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) + : kNullGain; - // Whether the gradient of missing values is put to the left side. - bool missing_left = true; - float gain = thread_active ? 
LossChangeMissing(left_sum, missing, parent_sum, param, nidx, - fidx, evaluator, missing_left) - : kNullGain; - // printf("l: %f, r: %f, gain: %f\n", left_sum.GetHess(), right_sum.GetHess(), gain); - - auto const thresh = threadIdx.x + scan_begin; - auto const &forward = missing_left; - auto const &backward = !forward; - // prevent splitting on missing value alone, skip the last bin if it's forward scan - // and skip the first bin if it's backward - auto const first_bin = gidx_begin; - auto const last_bin = gidx_end - 1; - bool const should_skip = (forward && thresh == last_bin) || (backward && thresh == first_bin); - if (should_skip) { - gain = kNullGain; + // Find thread with best gain + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + // This reduce result is only valid in thread 0 + // broadcast to the rest of the warp + auto best_thread = __shfl_sync(0xffffffff, best.key, 0); + // Best thread updates the split + if (threadIdx.x == best_thread) { + // index of best threshold inside a feature. + auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); + best_split->UpdateCat(gain, kLeftDir, best_thresh, fidx, left_sum, right_sum, param); } + } + cub::CTA_SYNC(); + + // backward + bst_bin_t n_bins = gidx_end - gidx_begin; + bst_bin_t it_begin = gidx_end - 1; + bst_bin_t it_end = it_begin - n_bins + 1; + SumCallbackOp backward_op; + for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { + auto it = scan_begin - static_cast(threadIdx.x); + bool thread_active = it > it_end; + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + : GradientPairPrecise(); + BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); + GradientPairPrecise right_sum = parent_sum - left_sum; + + auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) + : kNullGain; // Find thread with best gain auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); @@ -207,13 +220,12 @@ class EvaluateSplitAgent { auto best_thread = __shfl_sync(0xffffffff, best.key, 0); // Best thread updates the split if (threadIdx.x == best_thread) { - GradientPairPrecise left = missing_left ? left_sum + missing : left_sum; - GradientPairPrecise right = parent_sum - left; + assert(thread_active); // index of best threshold inside a feature. - auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); - auto dft_dir = missing_left ? 
kLeftDir : kRightDir; - // printf("best_thresh: %d\n", static_cast(best_thresh)); - best_split->UpdateCat(gain, dft_dir, best_thresh, fidx, left, right, param); + // auto best_thresh = (it_begin - scan_begin) - static_cast(threadIdx.x); + auto best_thresh = scan_begin - static_cast(threadIdx.x) - gidx_begin; + assert(best_thresh > 0); + best_split->UpdateCat(gain, kRightDir, best_thresh, fidx, left_sum, right_sum, param); } } } @@ -255,7 +267,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - agent.Partition(&best_split, node_sorted_idx.data(), offset); + agent.Partition(&best_split, node_sorted_idx, offset); } } else { agent.Numerical(&best_split); @@ -296,16 +308,11 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu size_t node_offset = input_idx * shared_inputs.feature_values.size(); auto const best_thresh = out_split.thresh; if (best_thresh == -1) { - printf("Invalid split\n"); return; } auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); - // bool forward = true; bool forward = out_split.dir == kLeftDir; - // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), - // static_cast(forward)); - bst_bin_t partition = forward ? best_thresh + 1 : best_thresh; auto beg = dh::tcbegin(f_sorted_idx); assert(partition > 0 && "Invalid partition."); diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index b624152eecfd..644eadc30f81 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -196,8 +196,6 @@ class HistEvaluator { CHECK_GT(partition, 0); std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] - // printf("best_thresh setter: %d, forward: %d\n", static_cast(best_thresh), - // static_cast(d_step == 1)); } p_best->Update(best); From b1ada59a9601f16e71725949b975437967eb260a Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:45:47 +0800 Subject: [PATCH 06/19] Revert debug. 
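The forward/backward scan introduced in the previous patch is the core of the partition-based categorical split this series implements: categories are pre-sorted by their gradient statistics (the sorted_idx buffer), prefixes of that ordering are scanned from both ends, and the best-scoring prefix becomes the set of categories sent to the left child. As a rough illustration of the idea only -- a simplified sketch, not part of this change: it uses a plain L2-style gain in place of the evaluator and skips the missing-value bookkeeping and block-scan/warp-reduce machinery of the real kernel:

    def best_partition(grad, hess, sorted_cats, parent_grad, parent_hess,
                       reg_lambda=1.0, max_cat_threshold=64):
        # Toy search for the best prefix of the gradient-sorted categories.
        def gain(gl, hl, gr, hr):
            return gl * gl / (hl + reg_lambda) + gr * gr / (hr + reg_lambda)

        n = min(max_cat_threshold, len(sorted_cats))
        best_gain, best_left = -float("inf"), set()
        for order in (sorted_cats, list(reversed(sorted_cats))):  # forward, then backward
            gl = hl = 0.0
            for i, c in enumerate(order[:n - 1]):  # keep at least one category per side
                gl += grad[c]
                hl += hess[c]
                g = gain(gl, hl, parent_grad - gl, parent_hess - hl)
                if g > best_gain:
                    best_gain, best_left = g, set(order[:i + 1])
        return best_gain, best_left

This commit itself only reverts the temporary debugging changes in the Python tests: the reproduce_failure decorators, the JSON model dump, and the disabled one-hot / approx test paths.
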
--- tests/python-gpu/test_gpu_updaters.py | 3 +-- tests/python/test_updaters.py | 13 +++---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index afb3dbe5cf85..b2f8b1a27050 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -3,7 +3,7 @@ import gc import pytest import xgboost as xgb -from hypothesis import given, strategies, assume, settings, note, reproduce_failure +from hypothesis import given, strategies, assume, settings, note sys.path.append("tests/python") import testing as tm @@ -84,7 +84,6 @@ def test_categorical_ohe(self, rows, cols, rounds, cats): ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): self.cputest.run_categorical_missing(rows, cols, cats, "gpu_hist") diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 4b864e447f8e..7faa4c58b51d 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -246,16 +246,10 @@ def run(max_cat_to_onehot: int): booster = xgb.train( parameters, Xy, - num_boost_round=8, + num_boost_round=16, evals=[(Xy, "Train")], evals_result=evals_result ) - import json - booster.save_model(f"{tree_method}.json") - with open(f"{tree_method}.json", "r") as fd: - model = json.load(fd) - with open(f"{tree_method}.json", "w") as fd: - json.dump(model, fd, indent=2) assert tm.non_increasing(evals_result["Train"]["rmse"]) y_predt = booster.predict(Xy) @@ -263,7 +257,7 @@ def run(max_cat_to_onehot: int): np.testing.assert_allclose(rmse, evals_result["Train"]["rmse"][-1]) # Test with OHE split - # run(self.USE_ONEHOT) + run(self.USE_ONEHOT) # Test with partition-based split run(self.USE_PART) @@ -400,7 +394,6 @@ def test_categorical_ames_housing( ) @settings(deadline=None, print_blob=True) @pytest.mark.skipif(**tm.no_pandas()) - @reproduce_failure('6.47.1', b'AACGAQE=') def test_categorical_missing(self, rows, cols, cats): - # self.run_categorical_missing(rows, cols, cats, "approx") + self.run_categorical_missing(rows, cols, cats, "approx") self.run_categorical_missing(rows, cols, cats, "hist") From 29515861c3494c6c5d5151d981cda81b9fd24fb9 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:48:25 +0800 Subject: [PATCH 07/19] Cleanup. 
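Drops the PartitionBasic test scaffolding (including the large commented-out cases) and the unused TestPartitionBasedEvaluation fixture from the C++ tests, and removes the leftover reproduce_failure import from the Python tests.
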
--- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 119 ------------------ tests/cpp/tree/test_evaluate_splits.h | 14 --- tests/python/test_updaters.py | 2 +- 3 files changed, 1 insertion(+), 134 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 2837036df32d..df57d4f2dcad 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -19,125 +19,6 @@ auto ZeroParam() { } } // anonymous namespace -TEST(GpuHist, PartitionBasic) { - TrainParam tparam = ZeroParam(); - tparam.max_cat_to_onehot = 0; - GPUTrainingParam param{tparam}; - - common::HistogramCuts cuts; - cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; - cuts.min_vals_.HostVector() = std::vector{0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); - cuts.min_vals_.SetDevice(0); - thrust::device_vector feature_set = std::vector{0}; - - thrust::device_vector monotonic_constraints(feature_set.size(), 0); - dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); - common::Span d_feature_types; - auto max_cat = *std::max_element(cuts.Values().begin(), cuts.Values().end()); - cuts.SetCategorical(true, max_cat); - d_feature_types = dh::ToSpan(feature_types); - - EvaluateSplitSharedInputs shared_inputs{ - param, - d_feature_types, - cuts.cut_ptrs_.ConstDeviceSpan(), - cuts.cut_values_.ConstDeviceSpan(), - cuts.min_vals_.ConstDeviceSpan(), - }; - - GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; - evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); - - { - // -1.0s go left - // -3.0s go right - GradientPairPrecise parent_sum(-5.0, 3.0); - thrust::device_vector feature_histogram = - std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; - EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); - EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - } - - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-7.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; - // EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // All -1.0, gain from splitting should be 0.0 - // GradientPairPrecise parent_sum(-3.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate 
result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // With 3.0/3.0 missing values - // All categories go left - // missing values go right - // { - // GradientPairPrecise parent_sum(0.0, 6.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("11100000000000000000000000000000")); - // EXPECT_EQ(result.dir, kRightDir); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-5.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; - // EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } - // { - // // -1.0s go left - // // -3.0s go right - // GradientPairPrecise parent_sum(-5.0, 3.0); - // thrust::device_vector feature_histogram = - // std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; - // EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), - // dh::ToSpan(feature_histogram)}; - // DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; - // auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - // EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); - // EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); - // EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); - // } -} - void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 7db556659e2b..50a089fd6997 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -97,19 +97,5 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; - -class TestPartitionBasedEvaluation : public testing::Test { - common::HistogramCuts cuts; - - public: - void SetUp() override { - cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; - cuts.min_vals_.HostVector() = std::vector{0.0}; - - auto max_cat = 
*std::max_element(cuts.Values().begin(), cuts.Values().end()); - cuts.SetCategorical(true, max_cat); - } -}; } // namespace tree } // namespace xgboost diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 7faa4c58b51d..3e43b98ff113 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -5,7 +5,7 @@ import pytest import xgboost as xgb import numpy as np -from hypothesis import given, strategies, settings, note, reproduce_failure +from hypothesis import given, strategies, settings, note exact_parameter_strategy = strategies.fixed_dictionaries({ 'nthread': strategies.integers(1, 4), From 6c3de5eab5c91fa9bea2ed63d075460f733b1679 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 03:48:55 +0800 Subject: [PATCH 08/19] fix. --- src/tree/hist/evaluate_splits.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 644eadc30f81..5f737675b72e 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -143,6 +143,7 @@ class HistEvaluator { static_assert(d_step == +1 || d_step == -1, "Invalid step."); auto const &cut_ptr = cut.Ptrs(); + auto const &cut_val = cut.Values(); auto const &parent = snode_[nidx]; bst_bin_t f_begin = cut_ptr[fidx]; @@ -194,8 +195,10 @@ class HistEvaluator { common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); CHECK_GT(partition, 0); - std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, - [&](size_t c) { cat_bits.Set(c); }); // fixme: cut_values[c] + std::for_each(sorted_idx.begin(), sorted_idx.begin() + partition, [&](size_t c) { + auto cat = cut_val[c + f_begin]; + cat_bits.Set(cat); + }); } p_best->Update(best); From 37c79cbcce840c91309d340ebaf32f540aaa18b1 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:16:08 +0800 Subject: [PATCH 09/19] Support max_cat_thresh. --- src/tree/gpu_hist/evaluate_splits.cu | 84 ++++++++++++++-------------- src/tree/updater_gpu_common.cuh | 4 +- 2 files changed, 45 insertions(+), 43 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 58e5e41d140e..257a58000f1d 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -167,66 +167,66 @@ class EvaluateSplitAgent { } } + __device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active, + bool missing_left, bst_bin_t it, + GradientPairPrecise const &left_sum, + GradientPairPrecise const &right_sum, + DeviceSplitCandidate *__restrict__ best_split) { + auto gain = + thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) : kNullGain; + + // Find thread with best gain + auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); + // This reduce result is only valid in thread 0 + // broadcast to the rest of the warp + auto best_thread = __shfl_sync(0xffffffff, best.key, 0); + // Best thread updates the split + if (threadIdx.x == best_thread) { + assert(thread_active); + // index of best threshold inside a feature. + auto best_thresh = it - gidx_begin; + best_split->UpdateCat(gain, missing_left ? 
kLeftDir : kRightDir, best_thresh, fidx, left_sum, + right_sum, param); + } + } + __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, common::Span sorted_idx, - std::size_t offset) { - for (bst_bin_t scan_begin = gidx_begin; scan_begin < gidx_end - 1; scan_begin += kBlockSize) { - bool thread_active = (scan_begin + threadIdx.x) < (gidx_end - 1); + std::size_t offset, GPUTrainingParam const& param) { + bst_bin_t n_bins_feature = gidx_end - gidx_begin; + auto n_bins = std::min(param.max_cat_threshold, n_bins_feature); - auto right_sum = - thread_active ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset) - : GradientPairPrecise(); + bst_bin_t it_begin = gidx_begin; + bst_bin_t it_end = it_begin + n_bins - 1; + // forward + for (bst_bin_t scan_begin = it_begin; scan_begin < it_end; scan_begin += kBlockSize) { + auto it = scan_begin + threadIdx.x; + bool thread_active = it < it_end; + + auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); GradientPairPrecise left_sum = parent_sum - right_sum; - auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) - : kNullGain; - // Find thread with best gain - auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); - // This reduce result is only valid in thread 0 - // broadcast to the rest of the warp - auto best_thread = __shfl_sync(0xffffffff, best.key, 0); - // Best thread updates the split - if (threadIdx.x == best_thread) { - // index of best threshold inside a feature. - auto best_thresh = threadIdx.x + (scan_begin - gidx_begin); - best_split->UpdateCat(gain, kLeftDir, best_thresh, fidx, left_sum, right_sum, param); - } + PartitionUpdate(scan_begin, thread_active, true, it, left_sum, right_sum, best_split); } - cub::CTA_SYNC(); // backward - bst_bin_t n_bins = gidx_end - gidx_begin; - bst_bin_t it_begin = gidx_end - 1; - bst_bin_t it_end = it_begin - n_bins + 1; + it_begin = gidx_end - 1; + it_end = it_begin - n_bins + 1; SumCallbackOp backward_op; for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) : GradientPairPrecise(); + // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); GradientPairPrecise right_sum = parent_sum - left_sum; - - auto gain = thread_active ? evaluator.CalcSplitGain(param, nidx, fidx, left_sum, right_sum) - : kNullGain; - - // Find thread with best gain - auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax()); - // This reduce result is only valid in thread 0 - // broadcast to the rest of the warp - auto best_thread = __shfl_sync(0xffffffff, best.key, 0); - // Best thread updates the split - if (threadIdx.x == best_thread) { - assert(thread_active); - // index of best threshold inside a feature. 
- // auto best_thresh = (it_begin - scan_begin) - static_cast(threadIdx.x); - auto best_thresh = scan_begin - static_cast(threadIdx.x) - gidx_begin; - assert(best_thresh > 0); - best_split->UpdateCat(gain, kRightDir, best_thresh, fidx, left_sum, right_sum, param); - } + PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } } }; @@ -267,7 +267,7 @@ __global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel( auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - agent.Partition(&best_split, node_sorted_idx, offset); + agent.Partition(&best_split, node_sorted_idx, offset, shared_inputs.param); } } else { agent.Numerical(&best_split); diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index b9a31dd32477..891ba2a914ac 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -29,6 +29,7 @@ struct GPUTrainingParam { float max_delta_step; float learning_rate; uint32_t max_cat_to_onehot; + bst_bin_t max_cat_threshold; GPUTrainingParam() = default; @@ -38,7 +39,8 @@ struct GPUTrainingParam { reg_alpha(param.reg_alpha), max_delta_step(param.max_delta_step), learning_rate{param.learning_rate}, - max_cat_to_onehot{param.max_cat_to_onehot} {} + max_cat_to_onehot{param.max_cat_to_onehot}, + max_cat_threshold{param.max_cat_threshold} {} }; /** From e52b4e5379d8f3e1348263db9223bc07a9c4d2a2 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:20:17 +0800 Subject: [PATCH 10/19] Tests. --- tests/python-gpu/test_gpu_updaters.py | 43 +++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index b2f8b1a27050..d0e7c5bc883d 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -1,3 +1,4 @@ +from typing import Dict, Any import numpy as np import sys import gc @@ -77,6 +78,48 @@ def test_sparse(self, dataset): def test_categorical_ohe(self, rows, cols, rounds, cats): self.cputest.run_categorical_ohe(rows, cols, rounds, cats, "gpu_hist") + @given( + tm.categorical_dataset_strategy, + test_up.exact_parameter_strategy, + test_up.hist_parameter_strategy, + test_up.cat_parameter_strategy, + strategies.integers(4, 32), + ) + @settings(deadline=None, print_blob=True) + @pytest.mark.skipif(**tm.no_pandas()) + def test_categorical( + self, + dataset: tm.TestDataset, + exact_parameters: Dict[str, Any], + hist_parameters: Dict[str, Any], + cat_parameters: Dict[str, Any], + n_rounds: int, + ) -> None: + cat_parameters.update(exact_parameters) + cat_parameters.update(hist_parameters) + cat_parameters["tree_method"] = "gpu_hist" + + results = train_result(cat_parameters, dataset.get_dmat(), n_rounds) + tm.non_increasing(results["train"]["rmse"]) + + @given( + test_up.hist_parameter_strategy, + test_up.cat_parameter_strategy, + ) + @settings(deadline=None, print_blob=True) + def test_categorical_ames_housing( + self, + hist_parameters: Dict[str, Any], + cat_parameters: Dict[str, Any], + ) -> None: + cat_parameters.update(hist_parameters) + dataset = tm.TestDataset( + "ames_housing", tm.get_ames_housing, "reg:squarederror", "rmse" + ) + cat_parameters["tree_method"] = "gpu_hist" + results = train_result(cat_parameters, dataset.get_dmat(), 16) + tm.non_increasing(results["train"]["rmse"]) + @given( strategies.integers(10, 400), strategies.integers(3, 8), From 
408f9bcf355e0e7cc3ed2c7399394746db283ea7 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:32:48 +0800 Subject: [PATCH 11/19] Cleanup. --- src/tree/gpu_hist/evaluate_splits.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 257a58000f1d..f07b28a8687b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -216,7 +216,7 @@ class EvaluateSplitAgent { // backward it_begin = gidx_end - 1; it_end = it_begin - n_bins + 1; - SumCallbackOp backward_op; + prefix_op = SumCallbackOp{}; // reset for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; @@ -224,7 +224,7 @@ class EvaluateSplitAgent { auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. - BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, backward_op); + BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op); GradientPairPrecise right_sum = parent_sum - left_sum; PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } From 2732c6b6c2d5203919925e77295bf6b7bcf5226f Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:43:14 +0800 Subject: [PATCH 12/19] Size. --- src/tree/hist/evaluate_splits.h | 2 +- src/tree/updater_gpu_hist.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index 5f737675b72e..0a09718ef911 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -190,7 +190,7 @@ class HistEvaluator { } if (best_thresh != -1) { - auto n = common::CatBitField::ComputeStorageSize(n_bins_feature + 1); + auto n = common::CatBitField::ComputeStorageSize(n_bins_feature); best.cat_bits = decltype(best.cat_bits)(n, 0); common::CatBitField cat_bits{best.cat_bits}; bst_bin_t partition = d_step == 1 ? (best_thresh - it_begin + 1) : (best_thresh - f_begin); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index bafe800cd98b..e1a940a9df34 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -608,7 +608,7 @@ struct GPUHistMakerDevice { CHECK_GT(candidate.split.split_cats.Bits().size(), 0); auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid); auto n_bins_feature = page->Cuts().FeatureBins(candidate.split.findex); - split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature + 1), 0); + split_cats.resize(common::CatBitField::ComputeStorageSize(n_bins_feature), 0); CHECK_LE(split_cats.size(), h_cats.size()); std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data()); From 3d877908b57f83a89a6d00688b1bbb630b756cc4 Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 04:44:21 +0800 Subject: [PATCH 13/19] lint. 
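The change below is purely a linter fix in the Dask interface: a membership test on a dict should use the mapping itself rather than building a keys view first. A minimal sketch of the idiom, with a hypothetical dict standing in for the serialised state (not part of the patch):

    # Hypothetical stand-in for the object's __dict__ during pickling.
    state = {"_client": None, "n_estimators": 100}
    # Preferred: 'in' on a dict already tests the keys.
    if "_client" in state:
        del state["_client"]
    # Equivalent but redundant, and flagged by pylint:
    # if "_client" in state.keys(): ...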
--- python-package/xgboost/dask.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 951676a81757..75eeba875fee 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -1684,7 +1684,7 @@ async def _() -> Awaitable[Any]: def __getstate__(self) -> Dict: this = self.__dict__.copy() - if "_client" in this.keys(): + if "_client" in this: del this["_client"] return this From 93aab02522c49f329cbd9d8cb37ebc0990cc08fd Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 16:17:45 +0800 Subject: [PATCH 14/19] Fix test. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index df57d4f2dcad..99802b61034f 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -68,7 +68,11 @@ void TestEvaluateSingleSplit(bool is_categorical) { DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; EXPECT_EQ(result.findex, 1); - EXPECT_EQ(result.fvalue, 11.0); + if (is_categorical) { + ASSERT_TRUE(std::isnan(result.fvalue)); + } else { + EXPECT_EQ(result.fvalue, 11.0); + } EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), From 19a8fbd04e30b4a2aede3672bb0ec5602b0bb0fc Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 31 Aug 2022 17:52:44 +0800 Subject: [PATCH 15/19] Add quick test. --- src/tree/gpu_hist/evaluate_splits.cu | 19 ++++-- src/tree/updater_gpu_common.cuh | 1 + .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 58 ++++++++++++------- tests/cpp/tree/hist/test_evaluate_splits.cc | 28 +++++++++ tests/cpp/tree/test_evaluate_splits.h | 54 +++++++++++++++++ 5 files changed, 134 insertions(+), 26 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index f07b28a8687b..0888e590ba94 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -166,7 +166,9 @@ class EvaluateSplitAgent { } } } - + /** + * \brief Gather and update the best split. + */ __device__ __forceinline__ void PartitionUpdate(bst_bin_t scan_begin, bool thread_active, bool missing_left, bst_bin_t it, GradientPairPrecise const &left_sum, @@ -189,10 +191,13 @@ class EvaluateSplitAgent { right_sum, param); } } - + /** + * \brief Partition-based split for categorical feature. + */ __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split, common::Span sorted_idx, - std::size_t offset, GPUTrainingParam const& param) { + std::size_t node_offset, + GPUTrainingParam const ¶m) { bst_bin_t n_bins_feature = gidx_end - gidx_begin; auto n_bins = std::min(param.max_cat_threshold, n_bins_feature); @@ -201,10 +206,10 @@ class EvaluateSplitAgent { // forward for (bst_bin_t scan_begin = it_begin; scan_begin < it_end; scan_begin += kBlockSize) { - auto it = scan_begin + threadIdx.x; + auto it = scan_begin + static_cast(threadIdx.x); bool thread_active = it < it_end; - auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + auto right_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. 
BlockScanT(temp_storage->scan).InclusiveSum(right_sum, right_sum, prefix_op); @@ -217,15 +222,17 @@ class EvaluateSplitAgent { it_begin = gidx_end - 1; it_end = it_begin - n_bins + 1; prefix_op = SumCallbackOp{}; // reset + for (bst_bin_t scan_begin = it_begin; scan_begin > it_end; scan_begin -= kBlockSize) { auto it = scan_begin - static_cast(threadIdx.x); bool thread_active = it > it_end; - auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - offset) + auto left_sum = thread_active ? LoadGpair(node_histogram + sorted_idx[it] - node_offset) : GradientPairPrecise(); // No min value for cat feature, use inclusive scan. BlockScanT(temp_storage->scan).InclusiveSum(left_sum, left_sum, prefix_op); GradientPairPrecise right_sum = parent_sum - left_sum; + PartitionUpdate(scan_begin, thread_active, false, it, left_sum, right_sum, best_split); } } diff --git a/src/tree/updater_gpu_common.cuh b/src/tree/updater_gpu_common.cuh index 891ba2a914ac..49c184a4abbb 100644 --- a/src/tree/updater_gpu_common.cuh +++ b/src/tree/updater_gpu_common.cuh @@ -132,6 +132,7 @@ struct DeviceSplitCandidate { << "dir: " << c.dir << ", " << "findex: " << c.findex << ", " << "fvalue: " << c.fvalue << ", " + << "thresh: " << c.thresh << ", " << "is_cat: " << c.is_cat << ", " << "left sum: " << c.left_sum << ", " << "right sum: " << c.right_sum << std::endl; diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 99802b61034f..2bf798279342 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -2,6 +2,7 @@ * Copyright 2020-2022 by XGBoost contributors */ #include + #include "../../../../src/tree/gpu_hist/evaluate_splits.cuh" #include "../../helpers.h" #include "../../histogram_helpers.h" @@ -17,29 +18,53 @@ auto ZeroParam() { tparam.UpdateAllowUnknown(args); return tparam; } + } // anonymous namespace +TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { + thrust::device_vector feature_set = std::vector{0}; + GPUTrainingParam param{param_}; + cuts_.cut_ptrs_.SetDevice(0); + cuts_.cut_values_.SetDevice(0); + cuts_.min_vals_.SetDevice(0); + thrust::device_vector feature_histogram{feature_histogram_}; + + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + auto d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitInputs input{1, 0, parent_sum_, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts_.cut_ptrs_.ConstDeviceSpan(), + cuts_.cut_values_.ConstDeviceSpan(), + cuts_.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{param_, static_cast(feature_set.size()), 0}; + + evaluator.Reset(cuts_, dh::ToSpan(feature_types), feature_set.size(), param_, 0); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + + ASSERT_EQ(result.thresh, 1); + this->CheckResult(result.loss_chg, result.findex, result.fvalue, result.is_cat, + result.dir == kLeftDir, result.left_sum, result.right_sum); +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); GPUTrainingParam param{tparam}; - common::HistogramCuts cuts; - cuts.cut_values_.HostVector() = std::vector{1.0, 2.0, 11.0, 12.0}; - cuts.cut_ptrs_.HostVector() = std::vector{0, 2, 4}; - cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; - cuts.cut_ptrs_.SetDevice(0); - cuts.cut_values_.SetDevice(0); 
- cuts.min_vals_.SetDevice(0); - thrust::device_vector feature_set = - std::vector{0, 1}; + common::HistogramCuts cuts{MakeCutsForTest({1.0, 2.0, 11.0, 12.0}, {0, 2, 4}, {0.0, 0.0}, 0)}; + thrust::device_vector feature_set = std::vector{0, 1}; // Setup gradients so that second feature gets higher gain thrust::device_vector feature_histogram = std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); common::Span d_feature_types; @@ -50,10 +75,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { d_feature_types = dh::ToSpan(feature_types); } - EvaluateSplitInputs input{1,0, - parent_sum, - dh::ToSpan(feature_set), - dh::ToSpan(feature_histogram)}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; EvaluateSplitSharedInputs shared_inputs{ param, d_feature_types, @@ -83,7 +106,7 @@ TEST(GpuHist, EvaluateSingleSplit) { TestEvaluateSingleSplit(false); } -TEST(GpuHist, EvaluateCategoricalSplit) { +TEST(GpuHist, EvaluateSingleCategoricalSplit) { TestEvaluateSingleSplit(true); } @@ -100,7 +123,6 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { thrust::device_vector feature_min_values = std::vector{0.0}; thrust::device_vector feature_histogram = std::vector{{-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -150,7 +172,6 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { thrust::device_vector feature_histogram = std::vector{ {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(2, 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -190,7 +211,6 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { thrust::device_vector feature_histogram = std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(2, 0); EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), @@ -231,7 +251,6 @@ TEST(GpuHist, EvaluateSplits) { thrust::device_vector feature_histogram_right = std::vector{ {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; - thrust::device_vector monotonic_constraints(feature_set.size(), 0); EvaluateSplitInputs input_left{ 1,0, parent_sum, @@ -290,6 +309,5 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split; ASSERT_NEAR(split.loss_chg, best_score_, 1e-16); } - } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 0dede27066a8..37c8597cab53 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -185,5 +185,33 @@ TEST(HistEvaluator, Categorical) { ASSERT_EQ(with_onehot.split.loss_chg, with_part.split.loss_chg); } + +TEST_F(TestCategoricalSplitWithMissing, HistEvaluator) { + common::HistCollection hist; + hist.Init(cuts_.TotalBins()); + hist.AddHistRow(0); + hist.AllocateAllData(); + auto node_hist = hist[0]; + ASSERT_EQ(node_hist.size(), feature_histogram_.size()); + std::copy(feature_histogram_.cbegin(), feature_histogram_.cend(), node_hist.begin()); + + auto sampler = std::make_shared(); + MetaInfo info; + info.num_col_ = 1; + info.feature_types = {FeatureType::kCategorical}; + auto evaluator = + HistEvaluator{param_, info, 
common::OmpGetNumThreads(0), sampler}; + evaluator.InitRoot(GradStats{parent_sum_}); + + std::vector entries(1); + RegTree tree; + evaluator.EvaluateSplits(hist, cuts_, info.feature_types.ConstHostSpan(), tree, &entries); + auto const& split = entries.front().split; + + this->CheckResult(split.loss_chg, split.SplitIndex(), split.split_value, split.is_cat, + split.DefaultLeft(), + GradientPairPrecise{split.left_sum.GetGrad(), split.left_sum.GetHess()}, + GradientPairPrecise{split.right_sum.GetGrad(), split.right_sum.GetHess()}); +} } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h index 50a089fd6997..2421b8ba0e41 100644 --- a/tests/cpp/tree/test_evaluate_splits.h +++ b/tests/cpp/tree/test_evaluate_splits.h @@ -97,5 +97,59 @@ class TestPartitionBasedSplit : public ::testing::Test { } while (std::next_permutation(sorted_idx_.begin(), sorted_idx_.end())); } }; + +inline auto MakeCutsForTest(std::vector values, std::vector ptrs, + std::vector min_values, int32_t device) { + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = values; + cuts.cut_ptrs_.HostVector() = ptrs; + cuts.min_vals_.HostVector() = min_values; + + if (device >= 0) { + cuts.cut_ptrs_.SetDevice(device); + cuts.cut_values_.SetDevice(device); + cuts.min_vals_.SetDevice(device); + } + + return cuts; +} + +class TestCategoricalSplitWithMissing : public testing::Test { + protected: + common::HistogramCuts cuts_; + // Setup gradients and parent sum with missing values. + GradientPairPrecise parent_sum_{1.0, 6.0}; + std::vector feature_histogram_{ + {0.5, 0.5}, {0.5, 0.5}, {1.0, 1.0}, {1.0, 1.0}}; + TrainParam param_; + + void SetUp() override { + cuts_ = MakeCutsForTest({0.0, 1.0, 2.0, 3.0}, {0, 4}, {0.0}, -1); + auto max_cat = *std::max_element(cuts_.cut_values_.HostVector().begin(), + cuts_.cut_values_.HostVector().end()); + cuts_.SetCategorical(true, max_cat); + param_.UpdateAllowUnknown( + Args{{"min_child_weight", "0"}, {"reg_lambda", "0"}, {"max_cat_to_onehot", "1"}}); + } + + void CheckResult(float loss_chg, bst_feature_t split_ind, float fvalue, bool is_cat, + bool dft_left, GradientPairPrecise left_sum, GradientPairPrecise right_sum) { + // forward + // it: 0, gain: 0.545455 + // it: 1, gain: 1.000000 + // it: 2, gain: 2.250000 + // backward + // it: 3, gain: 1.000000 + // it: 2, gain: 2.250000 + // it: 1, gain: 3.142857 + ASSERT_NEAR(loss_chg, 2.97619, kRtEps); + ASSERT_TRUE(is_cat); + ASSERT_TRUE(std::isnan(fvalue)); + ASSERT_EQ(split_ind, 0); + ASSERT_FALSE(dft_left); + ASSERT_EQ(left_sum.GetHess(), 2.5); + ASSERT_EQ(right_sum.GetHess(), parent_sum_.GetHess() - left_sum.GetHess()); + } +}; } // namespace tree } // namespace xgboost From fb4bba43a29734f055ee11d518689e3d9ec4e9ba Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:12:10 +0800 Subject: [PATCH 16/19] Add tests from Rory. 
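These tests drive the partition-based categorical evaluator with tiny hand-built histograms and check which categories land in the split bitset. As a rough sanity check for the first PartitionBasic case, the expected "1100...0" bitset can be verified by hand, assuming reg_lambda = 0 (as in ZeroParam) and the usual sum_grad^2 / sum_hess gain; the snippet is an illustration, not part of the patch:

    # Rough check of the first PartitionBasic case (assumption: lambda = 0,
    # gain(s) = grad(s)^2 / hess(s), no missing values in this node).
    def gain(g, h):
        return g * g / h if h > 0 else 0.0

    parent = (-5.0, 3.0)                            # parent_sum of the node
    bins = [(-1.0, 1.0), (-1.0, 1.0), (-3.0, 1.0)]  # one (grad, hess) pair per category
    parent_gain = gain(*parent)                     # 25 / 3, about 8.33
    # Two distinct ways to cut this feature: isolate the -3.0 bin, or isolate a -1.0 bin.
    for one_side in ([2], [0]):
        g = sum(bins[c][0] for c in one_side)
        h = sum(bins[c][1] for c in one_side)
        print(one_side, gain(g, h) + gain(parent[0] - g, parent[1] - h) - parent_gain)
    # Isolating the -3.0 bin scores 9 + 2 - 25/3 (about 2.67); isolating a -1.0 bin
    # scores only 1 + 8 - 25/3 (about 0.67). The two -1.0 categories therefore sit
    # together on one side, which is the "1100...0" bitset the test expects.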
--- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 236 ++++++++++++++++++ 1 file changed, 236 insertions(+) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 2bf798279342..6636dc4c8667 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -52,6 +52,242 @@ TEST_F(TestCategoricalSplitWithMissing, GPUHistEvaluator) { result.dir == kLeftDir, result.left_sum, result.right_sum); } +TEST(GpuHist, PartitionBasic) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types; + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + d_feature_types = dh::ToSpan(feature_types); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-7.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // All -1.0, gain from splitting should be 0.0 + GradientPairPrecise parent_sum(-3.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 
shared_inputs).split; + EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + // With 3.0/3.0 missing values + // All categories go left + // missing values go right + { + GradientPairPrecise parent_sum(0.0, 6.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{3, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_EQ(result.dir, kLeftDir); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; + EvaluateSplitInputs input{4, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + { + // -1.0s go left + // -3.0s go right + GradientPairPrecise parent_sum(-5.0, 3.0); + thrust::device_vector feature_histogram = + std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; + EvaluateSplitInputs input{5, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } +} + +TEST(GpuHist, PartitionTwoFeatures) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0, 0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3, 6}; + cuts.min_vals_.HostVector() = std::vector{0.0, 0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0, 1}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types(dh::ToSpan(feature_types)); + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + 
cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram = std::vector{ + {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + EvaluateSplitInputs input{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.findex, 1); + EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram = std::vector{ + {-2.0, 1.0}, {-2.0, 1.0}, {-2.0, 1.0}, {-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}}; + EvaluateSplitInputs input{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.findex, 1); + EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); + EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); + EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); + } +} + +TEST(GpuHist, PartitionTwoNodes) { + TrainParam tparam = ZeroParam(); + tparam.max_cat_to_onehot = 0; + GPUTrainingParam param{tparam}; + + common::HistogramCuts cuts; + cuts.cut_values_.HostVector() = std::vector{0.0, 1.0, 2.0}; + cuts.cut_ptrs_.HostVector() = std::vector{0, 3}; + cuts.min_vals_.HostVector() = std::vector{0.0}; + cuts.cut_ptrs_.SetDevice(0); + cuts.cut_values_.SetDevice(0); + cuts.min_vals_.SetDevice(0); + thrust::device_vector feature_set = std::vector{0}; + + thrust::device_vector monotonic_constraints(feature_set.size(), 0); + dh::device_vector feature_types(feature_set.size(), FeatureType::kCategorical); + common::Span d_feature_types(dh::ToSpan(feature_types)); + auto max_cat = + *std::max_element(cuts.cut_values_.HostVector().begin(), cuts.cut_values_.HostVector().end()); + cuts.SetCategorical(true, max_cat); + + EvaluateSplitSharedInputs shared_inputs{ + param, + d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; + + GPUHistEvaluator evaluator{tparam, static_cast(feature_set.size()), 0}; + evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); + + { + GradientPairPrecise parent_sum(-6.0, 3.0); + thrust::device_vector feature_histogram_a = + std::vector{{-1.0, 1.0}, {-2.5, 1.0}, {-2.5, 1.0}, + {-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + thrust::device_vector inputs(2); + inputs[0] = EvaluateSplitInputs{0, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_a)}; + thrust::device_vector feature_histogram_b = + std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-4.0, 1.0}}; + inputs[1] = EvaluateSplitInputs{1, 0, parent_sum, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram_b)}; + thrust::device_vector results(2); + evaluator.EvaluateSplits({0, 1}, 1, 
dh::ToSpan(inputs), shared_inputs, dh::ToSpan(results)); + GPUExpandEntry result_a = results[0]; + GPUExpandEntry result_b = results[1]; + EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(0)[0]), + std::bitset<32>("10000000000000000000000000000000")); + EXPECT_EQ(std::bitset<32>(evaluator.GetHostNodeCats(1)[0]), + std::bitset<32>("11000000000000000000000000000000")); + } +} + void TestEvaluateSingleSplit(bool is_categorical) { GradientPairPrecise parent_sum(0.0, 1.0); TrainParam tparam = ZeroParam(); From 76ac92da70e6c5865a8ac38fff205cf725d3f035 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:17:57 +0800 Subject: [PATCH 17/19] add check for direction. --- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 6636dc4c8667..5b070921925b 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -86,8 +86,8 @@ TEST(GpuHist, PartitionBasic) { evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; @@ -95,14 +95,15 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("11000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-7.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-3.0, 1.0}}; @@ -110,6 +111,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); @@ -122,13 +124,13 @@ TEST(GpuHist, PartitionBasic) { EvaluateSplitInputs input{2, 0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; + EXPECT_EQ(result.dir, kLeftDir); EXPECT_FLOAT_EQ(result.loss_chg, 0.0f); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } // With 3.0/3.0 missing values - // All categories go left - // missing values go right + // Forward, first 2 categories are selected, while the last one go to left along with missing value { GradientPairPrecise parent_sum(0.0, 6.0); thrust::device_vector feature_histogram = @@ -143,8 +145,8 @@ TEST(GpuHist, PartitionBasic) { 
EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-1.0, 1.0}, {-3.0, 1.0}, {-1.0, 1.0}}; @@ -152,6 +154,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("10100000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); @@ -166,6 +169,7 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); + EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); From 4acc9b4781fce30a4785d9e8a37bebe0b1c356f0 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 17:26:46 +0800 Subject: [PATCH 18/19] comment. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 5b070921925b..9228074625a0 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -160,8 +160,8 @@ TEST(GpuHist, PartitionBasic) { EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess()); } { - // -1.0s go left - // -3.0s go right + // -1.0s go right + // -3.0s go left GradientPairPrecise parent_sum(-5.0, 3.0); thrust::device_vector feature_histogram = std::vector{{-3.0, 1.0}, {-1.0, 1.0}, {-3.0, 1.0}}; From 8e43aa39c6f2d0cc4bbfe654c583ab45e146de82 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 6 Sep 2022 18:16:20 +0800 Subject: [PATCH 19/19] floating point. --- tests/cpp/tree/gpu_hist/test_evaluate_splits.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 9228074625a0..6ef8350bc505 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -169,7 +169,6 @@ TEST(GpuHist, PartitionBasic) { dh::ToSpan(feature_histogram)}; DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split; auto cats = std::bitset<32>(evaluator.GetHostNodeCats(input.nidx)[0]); - EXPECT_EQ(result.dir, kLeftDir); EXPECT_EQ(cats, std::bitset<32>("01000000000000000000000000000000")); EXPECT_FLOAT_EQ(result.left_sum.GetGrad() + result.right_sum.GetGrad(), parent_sum.GetGrad()); EXPECT_FLOAT_EQ(result.left_sum.GetHess() + result.right_sum.GetHess(), parent_sum.GetHess());
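For reference, the forward/backward gains quoted in the TestCategoricalSplitWithMissing fixture can be reproduced by hand. A minimal sketch, assuming reg_lambda = 0 and gain(s) = grad(s)^2 / hess(s) (the bins already sit in scan order here, so the sort by gradient ratio is a no-op); this is an illustration of the fixture's numbers, not part of the patch series:

    # Reproduces the gains listed in TestCategoricalSplitWithMissing (assumption:
    # lambda = 0; missing values are whatever remains once the histogram bins are
    # subtracted from the parent sum).
    def gain(g, h):
        return g * g / h if h > 0 else 0.0

    parent = (1.0, 6.0)                                      # parent_sum_
    bins = [(0.5, 0.5), (0.5, 0.5), (1.0, 1.0), (1.0, 1.0)]  # feature_histogram_
    parent_gain = gain(*parent)                              # 1 / 6

    # Forward scan: categories accumulate on one side, the remainder (which
    # includes the missing-value gradients) stays on the other side.
    g = h = 0.0
    for it, (bg, bh) in enumerate(bins[:-1]):
        g, h = g + bg, h + bh
        print("forward ", it, gain(g, h) + gain(parent[0] - g, parent[1] - h))

    # Backward scan over the reversed order.
    g = h = 0.0
    for it, (bg, bh) in zip(range(len(bins) - 1, 0, -1), reversed(bins)):
        g, h = g + bg, h + bh
        print("backward", it, gain(g, h) + gain(parent[0] - g, parent[1] - h))

    # Best candidate: backward it = 1 with gain ~ 3.142857, so
    # loss_chg ~ 3.142857 - 1/6 ~ 2.97619 and the accumulated side has hess 2.5,
    # matching CheckResult in the fixture.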