From 9dc183ae448dcc157ca453d9cc28a44308771d73 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 20 Jun 2022 08:22:28 -0700 Subject: [PATCH 01/17] Shared inputs --- src/tree/gpu_hist/evaluate_splits.cu | 108 +++++++-------- src/tree/gpu_hist/evaluate_splits.cuh | 34 ++--- src/tree/gpu_hist/evaluator.cu | 25 ++-- src/tree/updater_gpu_hist.cu | 105 +++++++-------- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 123 +++++++++--------- 5 files changed, 200 insertions(+), 195 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 5261c1b6ae03..4610527c7fd2 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -78,16 +78,16 @@ ReduceFeature(common::Span feature_histogram, template __device__ void EvaluateFeature( - int fidx, EvaluateSplitInputs inputs, + int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span sorted_idx, size_t offset, DeviceSplitCandidate *best_split, // shared memory storing best split TempStorageT *temp_storage // temp memory for cub operations ) { // Use pointer from cut to indicate begin and end of bins for each feature. - uint32_t gidx_begin = inputs.feature_segments[fidx]; // beginning bin + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin uint32_t gidx_end = - inputs.feature_segments[fidx + 1]; // end bin for i^th feature + shared_inputs.feature_segments[fidx + 1]; // end bin for i^th feature auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); // Sum histogram bins for current feature @@ -133,7 +133,7 @@ __device__ void EvaluateFeature( bool missing_left = true; float gain = null_gain; if (thread_active) { - gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, inputs.param, + gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, shared_inputs.param, inputs.nidx, fidx, evaluator, missing_left); } @@ -156,40 +156,40 @@ __device__ void EvaluateFeature( switch (type) { case kNum: { // Use pointer from cut to indicate begin and end of bins for each feature. - uint32_t gidx_begin = inputs.feature_segments[fidx]; // beginning bin + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin int split_gidx = (scan_begin + threadIdx.x) - 1; float fvalue; if (split_gidx < static_cast(gidx_begin)) { - fvalue = inputs.min_fvalue[fidx]; + fvalue = shared_inputs.min_fvalue[fidx]; } else { - fvalue = inputs.feature_values[split_gidx]; + fvalue = shared_inputs.feature_values[split_gidx]; } GradientPairPrecise left = missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; GradientPairPrecise right = inputs.parent_sum - left; best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, - false, inputs.param); + false, shared_inputs.param); break; } case kOneHot: { int32_t split_gidx = (scan_begin + threadIdx.x); - float fvalue = inputs.feature_values[split_gidx]; + float fvalue = shared_inputs.feature_values[split_gidx]; GradientPairPrecise left = missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; GradientPairPrecise right = inputs.parent_sum - left; best_split->Update(gain, missing_left ? 
kLeftDir : kRightDir, fvalue, fidx, left, right, - true, inputs.param); + true, shared_inputs.param); break; } case kPart: { int32_t split_gidx = (scan_begin + threadIdx.x); - float fvalue = inputs.feature_values[split_gidx]; + float fvalue = shared_inputs.feature_values[split_gidx]; GradientPairPrecise left = missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; GradientPairPrecise right = inputs.parent_sum - left; auto best_thresh = block_max.key; // index of best threshold inside a feature. best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, - right, true, inputs.param); + right, true, shared_inputs.param); break; } } @@ -199,8 +199,9 @@ __device__ void EvaluateFeature( } template -__global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, - EvaluateSplitInputs right, +__global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, + EvaluateSplitInputs right, + const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, common::Span out_candidates) { @@ -231,28 +232,28 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, // If this block is working on the left or right node bool is_left = blockIdx.x < left.feature_set.size(); - EvaluateSplitInputs& inputs = is_left ? left : right; + EvaluateSplitInputs& inputs = is_left ? left : right; // One block for each feature. Features are sampled, so fidx != blockIdx.x int fidx = inputs.feature_set[is_left ? blockIdx.x : blockIdx.x - left.feature_set.size()]; - if (common::IsCat(inputs.feature_types, fidx)) { - auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx]; - if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) { + if (common::IsCat(shared_inputs.feature_types, fidx)) { + auto n_bins_in_feat = shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx]; + if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) { EvaluateFeature(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + kOneHot>(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { - auto node_sorted_idx = is_left ? sorted_idx.first(inputs.feature_values.size()) - : sorted_idx.last(inputs.feature_values.size()); - size_t offset = is_left ? 0 : inputs.feature_values.size(); + auto node_sorted_idx = is_left ? sorted_idx.first(shared_inputs.feature_values.size()) + : sorted_idx.last(shared_inputs.feature_values.size()); + size_t offset = is_left ? 0 : shared_inputs.feature_values.size(); EvaluateFeature(fidx, inputs, evaluator, node_sorted_idx, offset, &best_split, + kPart>(fidx, inputs,shared_inputs, evaluator, node_sorted_idx, offset, &best_split, &temp_storage); } } else { EvaluateFeature(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + kNum>(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } cub::CTA_SYNC(); @@ -270,8 +271,7 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, /** * \brief Set the bits for categorical splits based on the split threshold. 
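 *
 * For a one-hot split only the bit of the single matching category is set.
 * For a partition-based split the categories are visited in weight-sorted
 * order up to the best threshold (from the front when missing values go
 * right, from the back when they go left) and each visited category's bit
 * is set.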
*/ -template -__device__ void SetCategoricalSplit(EvaluateSplitInputs const &input, +__device__ void SetCategoricalSplit(EvaluateSplitInputs const &input,const EvaluateSplitSharedInputs &shared_inputs, common::Span d_sorted_idx, bst_feature_t fidx, bool is_left, common::Span out, DeviceSplitCandidate *p_out_split) { @@ -279,18 +279,18 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &inp out_split.split_cats = common::CatBitField{out}; // Simple case for one hot split - if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) { + if (common::UseOneHot(shared_inputs.FeatureBins(fidx), shared_inputs.param.max_cat_to_onehot)) { out_split.split_cats.Set(common::AsCat(out_split.fvalue)); return; } auto node_sorted_idx = - is_left ? d_sorted_idx.subspan(0, input.feature_values.size()) - : d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size()); - size_t node_offset = is_left ? 0 : input.feature_values.size(); + is_left ? d_sorted_idx.subspan(0, shared_inputs.feature_values.size()) + : d_sorted_idx.subspan(shared_inputs.feature_values.size(), shared_inputs.feature_values.size()); + size_t node_offset = is_left ? 0 : shared_inputs.feature_values.size(); auto best_thresh = out_split.PopBestThresh(); auto f_sorted_idx = - node_sorted_idx.subspan(input.feature_segments[fidx], input.FeatureBins(fidx)); + node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); if (out_split.dir != kLeftDir) { // forward, missing on right auto beg = dh::tcbegin(f_sorted_idx); @@ -299,7 +299,7 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &inp boundary = std::max(boundary, static_cast(1ul)); auto end = beg + boundary; thrust::for_each(thrust::seq, beg, end, [&](auto c) { - auto cat = input.feature_values[c - node_offset]; + auto cat = shared_inputs.feature_values[c - node_offset]; assert(!out_split.split_cats.Check(cat) && "already set"); out_split.SetCat(cat); }); @@ -307,19 +307,19 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &inp assert((f_sorted_idx.size() - best_thresh + 1) != 0 && " == 0"); thrust::for_each(thrust::seq, dh::tcrbegin(f_sorted_idx), dh::tcrbegin(f_sorted_idx) + (f_sorted_idx.size() - best_thresh), [&](auto c) { - auto cat = input.feature_values[c - node_offset]; + auto cat = shared_inputs.feature_values[c - node_offset]; out_split.SetCat(cat); }); } } template -void GPUHistEvaluator::EvaluateSplits( - EvaluateSplitInputs left, EvaluateSplitInputs right, - TreeEvaluator::SplitEvaluator evaluator, +void GPUHistEvaluator::LaunchEvaluateSplits( + EvaluateSplitInputs left, EvaluateSplitInputs right, + EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { if (need_sort_histogram_) { - this->SortHistogram(left, right, evaluator); + this->SortHistogram(left, right, shared_inputs,evaluator); } size_t combined_num_features = left.feature_set.size() + right.feature_set.size(); @@ -328,7 +328,7 @@ void GPUHistEvaluator::EvaluateSplits( // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, left, right, this->SortedIdx(left), + EvaluateSplitsKernel, left, right, shared_inputs, this->SortedIdx(left,shared_inputs), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features @@ -357,7 +357,7 @@ void GPUHistEvaluator::EvaluateSplits( } template -void 
GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, +void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out) { if (cats_out.empty()) return; dh::CUDAEvent event; @@ -370,16 +370,17 @@ void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, - EvaluateSplitInputs left, - EvaluateSplitInputs right, + EvaluateSplitInputs left, + EvaluateSplitInputs right, + EvaluateSplitSharedInputs shared_inputs, common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(2); auto out_splits = dh::ToSpan(splits_out_storage); - this->EvaluateSplits(left, right, evaluator, out_splits); + this->LaunchEvaluateSplits(left, right, shared_inputs,evaluator, out_splits); - auto d_sorted_idx = this->SortedIdx(left); + auto d_sorted_idx = this->SortedIdx(left,shared_inputs); auto d_entries = out_entries; auto cats_out = this->DeviceCatStorage(left.nidx); // turn candidate into entry, along with handling sort based split. @@ -391,13 +392,13 @@ void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, if (split.is_cat) { bool is_left = i == 0; auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2); - SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]); + SetCategoricalSplit(input, shared_inputs,d_sorted_idx, fidx, is_left, out, &out_splits[i]); } float base_weight = - evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum + split.right_sum}); - float left_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.left_sum}); - float right_weight = evaluator.CalcWeight(input.nidx, input.param, GradStats{split.right_sum}); + evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum + split.right_sum}); + float left_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum}); + float right_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.right_sum}); d_entries[i] = GPUExpandEntry{input.nidx, candidate.depth + 1, out_splits[i], base_weight, left_weight, right_weight}; @@ -408,14 +409,14 @@ void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, template GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( - EvaluateSplitInputs input, float weight) { + EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs,float weight) { dh::TemporaryArray splits_out(1); auto out_split = dh::ToSpan(splits_out); auto evaluator = tree_evaluator_.GetEvaluator(); - this->EvaluateSplits(input, {}, evaluator, out_split); + this->LaunchEvaluateSplits(input, {},shared_inputs, evaluator, out_split); auto cats_out = this->DeviceCatStorage(input.nidx); - auto d_sorted_idx = this->SortedIdx(input); + auto d_sorted_idx = this->SortedIdx(input,shared_inputs); dh::TemporaryArray entries(1); auto d_entries = entries.data().get(); @@ -424,11 +425,11 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( auto fidx = out_split[i].findex; if (split.is_cat) { - SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]); + SetCategoricalSplit(input,shared_inputs, d_sorted_idx, fidx, true, cats_out, &out_split[i]); } - float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum}); - float right_weight = evaluator.CalcWeight(0, input.param, GradStats{split.right_sum}); + float left_weight = evaluator.CalcWeight(0, shared_inputs.param, 
GradStats{split.left_sum}); + float right_weight = evaluator.CalcWeight(0, shared_inputs.param, GradStats{split.right_sum}); d_entries[0] = GPUExpandEntry(0, 0, split, weight, left_weight, right_weight); }); this->CopyToHost(input, cats_out); @@ -439,7 +440,6 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( return root_entry; } -template class GPUHistEvaluator; template class GPUHistEvaluator; } // namespace tree } // namespace xgboost diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 08b0270ee4d7..6b2e31c50992 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -17,18 +17,22 @@ class HistogramCuts; } namespace tree { -template + +// Inputs specific to each node struct EvaluateSplitInputs { int nidx; GradientPairPrecise parent_sum; - GPUTrainingParam param; common::Span feature_set; + common::Span gradient_histogram; +}; + +// Inputs necessary for all nodes +struct EvaluateSplitSharedInputs { + GPUTrainingParam param; common::Span feature_types; common::Span feature_segments; common::Span feature_values; common::Span min_fvalue; - common::Span gradient_histogram; - XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; } __device__ auto FeatureBins(bst_feature_t fidx) const { return feature_segments[fidx + 1] - feature_segments[fidx]; @@ -66,7 +70,7 @@ class GPUHistEvaluator { std::size_t node_categorical_storage_size_ = 0; // Copy the categories from device to host asynchronously. - void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); + void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); /** * \brief Get host category storage of nidx for internal calculation. @@ -105,16 +109,16 @@ class GPUHistEvaluator { /** * \brief Get sorted index storage based on the left node of inputs. */ - auto SortedIdx(EvaluateSplitInputs left) { + auto SortedIdx(EvaluateSplitInputs left, EvaluateSplitSharedInputs shared_inputs) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { - return dh::ToSpan(cat_sorted_idx_).first(left.feature_values.size()); + return dh::ToSpan(cat_sorted_idx_).first(shared_inputs.feature_values.size()); } return dh::ToSpan(cat_sorted_idx_); } - auto SortInput(EvaluateSplitInputs left) { + auto SortInput(EvaluateSplitInputs left, EvaluateSplitSharedInputs shared_inputs) { if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { - return dh::ToSpan(sort_input_).first(left.feature_values.size()); + return dh::ToSpan(sort_input_).first(shared_inputs.feature_values.size()); } return dh::ToSpan(sort_input_); } @@ -155,25 +159,25 @@ class GPUHistEvaluator { * \brief Sort the histogram based on output to obtain contiguous partitions. */ common::Span SortHistogram( - EvaluateSplitInputs const &left, EvaluateSplitInputs const &right, + EvaluateSplitInputs const &left, EvaluateSplitInputs const &right,EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator); // impl of evaluate splits, contains CUDA kernels so it's public - void EvaluateSplits(EvaluateSplitInputs left, - EvaluateSplitInputs right, + void LaunchEvaluateSplits(EvaluateSplitInputs left, + EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits); /** * \brief Evaluate splits for left and right nodes. 
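 *
 * Writes one GPUExpandEntry per child and, for categorical splits, fills
 * the per-node category bit field before copying it back to the host.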
*/ void EvaluateSplits(GPUExpandEntry candidate, - EvaluateSplitInputs left, - EvaluateSplitInputs right, + EvaluateSplitInputs left, + EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); /** * \brief Evaluate splits for root node. */ - GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input, float weight); + GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,EvaluateSplitSharedInputs shared_inputs, float weight); }; } // namespace tree } // namespace xgboost diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index aaf35243b2f5..edcf01ff3268 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -69,22 +69,22 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, template common::Span GPUHistEvaluator::SortHistogram( - EvaluateSplitInputs const &left, EvaluateSplitInputs const &right, + EvaluateSplitInputs const &left, EvaluateSplitInputs const &right, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator) { dh::XGBCachingDeviceAllocator alloc; - auto sorted_idx = this->SortedIdx(left); + auto sorted_idx = this->SortedIdx(left,shared_inputs); dh::Iota(sorted_idx); - auto data = this->SortInput(left); + auto data = this->SortInput(left,shared_inputs); auto it = thrust::make_counting_iterator(0u); auto d_feature_idx = dh::ToSpan(feature_idx_); thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data), [=] XGBOOST_DEVICE(uint32_t i) { - auto is_left = i < left.feature_values.size(); + auto is_left = i < shared_inputs.feature_values.size(); auto const &input = is_left ? left : right; - auto j = i - (is_left ? 0 : input.feature_values.size()); + auto j = i - (is_left ? 0 : shared_inputs.feature_values.size()); auto fidx = d_feature_idx[j]; - if (common::IsCat(input.feature_types, fidx)) { - auto lw = evaluator.CalcWeightCat(input.param, input.gradient_histogram[j]); + if (common::IsCat(shared_inputs.feature_types, fidx)) { + auto lw = evaluator.CalcWeightCat(shared_inputs.param, input.gradient_histogram[j]); return thrust::make_tuple(i, lw); } return thrust::make_tuple(i, 0.0); @@ -95,16 +95,15 @@ common::Span GPUHistEvaluator::SortHistogram( auto li = thrust::get<0>(l); auto ri = thrust::get<0>(r); - auto l_is_left = li < left.feature_values.size(); - auto r_is_left = ri < left.feature_values.size(); + auto l_is_left = li < shared_inputs.feature_values.size(); + auto r_is_left = ri < shared_inputs.feature_values.size(); if (l_is_left != r_is_left) { return l_is_left; // not the same node } - auto const &input = l_is_left ? left : right; - li -= (l_is_left ? 0 : input.feature_values.size()); - ri -= (r_is_left ? 0 : input.feature_values.size()); + li -= (l_is_left ? 0 : shared_inputs.feature_values.size()); + ri -= (r_is_left ? 
0 : shared_inputs.feature_values.size()); auto lfidx = d_feature_idx[li]; auto rfidx = d_feature_idx[ri]; @@ -113,7 +112,7 @@ common::Span GPUHistEvaluator::SortHistogram( return lfidx < rfidx; // not the same feature } - if (common::IsCat(input.feature_types, lfidx)) { + if (common::IsCat(shared_inputs.feature_types, lfidx)) { auto lw = thrust::get<1>(l); auto rw = thrust::get<1>(r); return lw < rw; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index ae209cdaf205..743247a8ca98 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -195,6 +195,7 @@ struct GPUHistMakerDevice { HistRounding histogram_rounding; dh::PinnedMemory pinned; + dh::PinnedMemory pinned2; common::Monitor monitor; common::ColumnSampler column_sampler; @@ -278,57 +279,58 @@ struct GPUHistMakerDevice { common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); - EvaluateSplitInputs inputs{nidx, - root_sum, - gpu_param, - feature_set, - feature_types, - matrix.feature_segments, - matrix.gidx_fvalue_map, - matrix.min_fvalue, - hist.GetNodeHistogram(nidx)}; - auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight); + EvaluateSplitInputs inputs{nidx, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; + EvaluateSplitSharedInputs shared_inputs{ + gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, + matrix.min_fvalue, + }; + auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs, weight); return split; } - void EvaluateLeftRightSplits(GPUExpandEntry candidate, int left_nidx, int right_nidx, + void EvaluateLeftRightSplits(const std::vector &candidates, const RegTree& tree, common::Span pinned_candidates_out) { - dh::TemporaryArray splits_out(2); - GPUTrainingParam gpu_param(param); - auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx)); - left_sampled_features->SetDevice(ctx_->gpu_id); - common::Span left_feature_set = - interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx); - auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); - right_sampled_features->SetDevice(ctx_->gpu_id); - common::Span right_feature_set = - interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + for (int i = 0; i < candidates.size(); i++) { + auto candidate = candidates.at(i); + int left_nidx = tree[candidate.nid].LeftChild(); + int right_nidx = tree[candidate.nid].RightChild(); + dh::TemporaryArray splits_out(2); + GPUTrainingParam gpu_param(param); + auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx)); + left_sampled_features->SetDevice(ctx_->gpu_id); + common::Span left_feature_set = + interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx); + auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); + right_sampled_features->SetDevice(ctx_->gpu_id); + common::Span right_feature_set = + interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); + auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto h_node_inputs = pinned2.GetSpan(2); + dh::TemporaryArray d_node_inputs(2); + h_node_inputs[0] = {left_nidx, candidate.split.left_sum, left_feature_set, + hist.GetNodeHistogram(left_nidx)}; + h_node_inputs[1] = {right_nidx, candidate.split.right_sum, right_feature_set, + 
hist.GetNodeHistogram(right_nidx)}; + dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(),h_node_inputs.data(),h_node_inputs.size()*sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); + + EvaluateSplitInputs left{left_nidx, candidate.split.left_sum, left_feature_set, + hist.GetNodeHistogram(left_nidx)}; + EvaluateSplitInputs right{right_nidx, + candidate.split.right_sum, + right_feature_set, + hist.GetNodeHistogram(right_nidx)}; + EvaluateSplitSharedInputs shared_inputs{ + gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, + matrix.min_fvalue, + }; - EvaluateSplitInputs left{left_nidx, - candidate.split.left_sum, - gpu_param, - left_feature_set, - feature_types, - matrix.feature_segments, - matrix.gidx_fvalue_map, - matrix.min_fvalue, - hist.GetNodeHistogram(left_nidx)}; - EvaluateSplitInputs right{right_nidx, - candidate.split.right_sum, - gpu_param, - right_feature_set, - feature_types, - matrix.feature_segments, - matrix.gidx_fvalue_map, - matrix.min_fvalue, - hist.GetNodeHistogram(right_nidx)}; - - dh::TemporaryArray entries(2); - this->evaluator_.EvaluateSplits(candidate, left, right, dh::ToSpan(entries)); - dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(), - sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); + dh::TemporaryArray entries(2); + this->evaluator_.EvaluateSplits(candidate, left, right, shared_inputs, dh::ToSpan(entries)); + dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), + entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), + cudaMemcpyDeviceToHost)); + } } void BuildHist(int nidx) { @@ -697,16 +699,9 @@ struct GPUHistMakerDevice { this->BuildHistLeftRight(filtered_expand_set, reducer, tree); monitor.Stop("BuildHist"); - for (auto i = 0ull; i < filtered_expand_set.size(); i++) { - auto candidate = filtered_expand_set.at(i); - int left_child_nidx = tree[candidate.nid].LeftChild(); - int right_child_nidx = tree[candidate.nid].RightChild(); - - monitor.Start("EvaluateSplits"); - this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree, - new_candidates.subspan(i * 2, 2)); - monitor.Stop("EvaluateSplits"); - } + monitor.Start("EvaluateSplits"); + this->EvaluateLeftRightSplits(filtered_expand_set, *p_tree, new_candidates); + monitor.Stop("EvaluateSplits"); dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); expand_set = driver.Pop(); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index fb1726004a39..fa98419f5223 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -35,8 +35,8 @@ void TestEvaluateSingleSplit(bool is_categorical) { std::vector{0, 1}; // Setup gradients so that second feature gets higher gain - thrust::device_vector feature_histogram = - std::vector{ + thrust::device_vector feature_histogram = + std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); @@ -50,21 +50,23 @@ void TestEvaluateSingleSplit(bool is_categorical) { d_feature_types = dh::ToSpan(feature_types); } - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1, parent_sum, - param, dh::ToSpan(feature_set), - d_feature_types, - cuts.cut_ptrs_.ConstDeviceSpan(), - cuts.cut_values_.ConstDeviceSpan(), - cuts.min_vals_.ConstDeviceSpan(), dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, + 
d_feature_types, + cuts.cut_ptrs_.ConstDeviceSpan(), + cuts.cut_values_.ConstDeviceSpan(), + cuts.min_vals_.ConstDeviceSpan(), + }; - GPUHistEvaluator evaluator{ + GPUHistEvaluator evaluator{ tparam, static_cast(feature_set.size()), 0}; evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0); DeviceSplitCandidate result = - evaluator.EvaluateSingleSplit(input, 0).split; + evaluator.EvaluateSingleSplit(input, shared_inputs,0).split; EXPECT_EQ(result.findex, 1); EXPECT_EQ(result.fvalue, 11.0); @@ -93,21 +95,23 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { std::vector{0, 2}; thrust::device_vector feature_values = std::vector{1.0, 2.0}; thrust::device_vector feature_min_values = std::vector{0.0}; - thrust::device_vector feature_histogram = - std::vector{{-0.5, 0.5}, {0.5, 0.5}}; + thrust::device_vector feature_histogram = + std::vector{{-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1, parent_sum, - param, dh::ToSpan(feature_set), - {}, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, + {}, + dh::ToSpan(feature_segments), + dh::ToSpan(feature_values), + dh::ToSpan(feature_min_values), + }; - GPUHistEvaluator evaluator(tparam, feature_set.size(), 0); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split; + GPUHistEvaluator evaluator(tparam, feature_set.size(), 0); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs,0).split; EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); @@ -118,9 +122,9 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { TEST(GpuHist, EvaluateSingleSplitEmpty) { TrainParam tparam = ZeroParam(); - GPUHistEvaluator evaluator(tparam, 1, 0); + GPUHistEvaluator evaluator(tparam, 1, 0); DeviceSplitCandidate result = - evaluator.EvaluateSingleSplit(EvaluateSplitInputs{}, 0).split; + evaluator.EvaluateSingleSplit(EvaluateSplitInputs{}, EvaluateSplitSharedInputs{}, 0).split; EXPECT_EQ(result.findex, -1); EXPECT_LT(result.loss_chg, 0.0f); } @@ -140,22 +144,24 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; - thrust::device_vector feature_histogram = - std::vector{ + thrust::device_vector feature_histogram = + std::vector{ {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(2, 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1, parent_sum, - param, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), dh::ToSpan(feature_min_values), - dh::ToSpan(feature_histogram)}; + }; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split; + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs, 0).split; EXPECT_EQ(result.findex, 1); EXPECT_EQ(result.fvalue, 11.0); @@ -178,22 +184,24 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 10.0}; - thrust::device_vector feature_histogram = - std::vector{ + thrust::device_vector 
feature_histogram = + std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(2, 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1, parent_sum, - param, dh::ToSpan(feature_set), + dh::ToSpan(feature_histogram)}; + EvaluateSplitSharedInputs shared_inputs{ + param, {}, dh::ToSpan(feature_segments), dh::ToSpan(feature_values), dh::ToSpan(feature_min_values), - dh::ToSpan(feature_histogram)}; + }; - GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); - DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, 0).split; + GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0); + DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs, 0).split; EXPECT_EQ(result.findex, 0); EXPECT_EQ(result.fvalue, 1.0); @@ -214,37 +222,34 @@ TEST(GpuHist, EvaluateSplits) { std::vector{1.0, 2.0, 11.0, 12.0}; thrust::device_vector feature_min_values = std::vector{0.0, 0.0}; - thrust::device_vector feature_histogram_left = - std::vector{ + thrust::device_vector feature_histogram_left = + std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-1.0, 0.5}, {1.0, 0.5}}; - thrust::device_vector feature_histogram_right = - std::vector{ + thrust::device_vector feature_histogram_right = + std::vector{ {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); - EvaluateSplitInputs input_left{ + EvaluateSplitInputs input_left{ 1, parent_sum, - param, dh::ToSpan(feature_set), - {}, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), dh::ToSpan(feature_histogram_left)}; - EvaluateSplitInputs input_right{ + EvaluateSplitInputs input_right{ 2, parent_sum, - param, dh::ToSpan(feature_set), - {}, - dh::ToSpan(feature_segments), - dh::ToSpan(feature_values), - dh::ToSpan(feature_min_values), dh::ToSpan(feature_histogram_right)}; + EvaluateSplitSharedInputs shared_inputs{ + param, + {}, + dh::ToSpan(feature_segments), + dh::ToSpan(feature_values), + dh::ToSpan(feature_min_values), + }; - GPUHistEvaluator evaluator{ + GPUHistEvaluator evaluator{ tparam, static_cast(feature_min_values.size()), 0}; - evaluator.EvaluateSplits(input_left, input_right, evaluator.GetEvaluator(), + evaluator.LaunchEvaluateSplits(input_left, input_right,shared_inputs, evaluator.GetEvaluator(), dh::ToSpan(out_splits)); DeviceSplitCandidate result_left = out_splits[0]; @@ -273,16 +278,18 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cudaMemcpyHostToDevice)); dh::device_vector feature_set{std::vector{0}}; - EvaluateSplitInputs input{0, + EvaluateSplitInputs input{0, total_gpair_, - GPUTrainingParam{param_}, dh::ToSpan(feature_set), + dh::ToSpan(d_hist)}; + EvaluateSplitSharedInputs shared_inputs{ + GPUTrainingParam{ param_}, dh::ToSpan(ft), cuts_.cut_ptrs_.ConstDeviceSpan(), cuts_.cut_values_.ConstDeviceSpan(), cuts_.min_vals_.ConstDeviceSpan(), - dh::ToSpan(d_hist)}; - auto split = evaluator.EvaluateSingleSplit(input, 0).split; + }; + auto split = evaluator.EvaluateSingleSplit(input, shared_inputs, 0).split; ASSERT_NEAR(split.loss_chg, best_score_, 1e-16); } } // namespace tree From 8074e13e77e58b0d41c12b8facc766c0be5c5a74 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 21 Jun 2022 05:36:41 -0700 Subject: [PATCH 02/17] Refactoring --- src/tree/gpu_hist/evaluate_splits.cu | 23 ++++++++-------- src/tree/gpu_hist/evaluate_splits.cuh | 20 +++++++------- src/tree/gpu_hist/evaluator.cu | 6 ++--- src/tree/updater_gpu_hist.cu | 26 +++++++++---------- 
.../cpp/tree/gpu_hist/test_evaluate_splits.cu | 3 ++- 5 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 4610527c7fd2..30eb17157079 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -314,12 +314,12 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &input,const Evalu } template -void GPUHistEvaluator::LaunchEvaluateSplits( +void GPUHistEvaluator::LaunchEvaluateSplits(common::Span d_inputs, EvaluateSplitInputs left, EvaluateSplitInputs right, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { if (need_sort_histogram_) { - this->SortHistogram(left, right, shared_inputs,evaluator); + this->SortHistogram(d_inputs,left, right, shared_inputs,evaluator); } size_t combined_num_features = left.feature_set.size() + right.feature_set.size(); @@ -328,7 +328,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits( // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, left, right, shared_inputs, this->SortedIdx(left,shared_inputs), + EvaluateSplitsKernel, left, right, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features @@ -369,23 +369,23 @@ void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input } template -void GPUHistEvaluator::EvaluateSplits(GPUExpandEntry candidate, +void GPUHistEvaluator::EvaluateSplits(common::Span d_inputs,GPUExpandEntry candidate, EvaluateSplitInputs left, EvaluateSplitInputs right, EvaluateSplitSharedInputs shared_inputs, common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); - dh::TemporaryArray splits_out_storage(2); + dh::TemporaryArray splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(left, right, shared_inputs,evaluator, out_splits); + this->LaunchEvaluateSplits(d_inputs,left, right, shared_inputs,evaluator, out_splits); - auto d_sorted_idx = this->SortedIdx(left,shared_inputs); + auto d_sorted_idx = this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()); auto d_entries = out_entries; auto cats_out = this->DeviceCatStorage(left.nidx); // turn candidate into entry, along with handling sort based split. - dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) { - auto const &input = i == 0 ? 
left : right; + dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) { + auto const input = d_inputs[i]; auto &split = out_splits[i]; auto fidx = out_splits[i].findex; @@ -413,10 +413,11 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( dh::TemporaryArray splits_out(1); auto out_split = dh::ToSpan(splits_out); auto evaluator = tree_evaluator_.GetEvaluator(); - this->LaunchEvaluateSplits(input, {},shared_inputs, evaluator, out_split); + dh::device_vector inputs = std::vector{input}; + this->LaunchEvaluateSplits(dh::ToSpan(inputs),input, {},shared_inputs, evaluator, out_split); auto cats_out = this->DeviceCatStorage(input.nidx); - auto d_sorted_idx = this->SortedIdx(input,shared_inputs); + auto d_sorted_idx = this->SortedIdx(inputs.size(), shared_inputs.feature_values.size()); dh::TemporaryArray entries(1); auto d_entries = entries.data().get(); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 6b2e31c50992..5bec8af32675 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -109,17 +109,15 @@ class GPUHistEvaluator { /** * \brief Get sorted index storage based on the left node of inputs. */ - auto SortedIdx(EvaluateSplitInputs left, EvaluateSplitSharedInputs shared_inputs) { - if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { - return dh::ToSpan(cat_sorted_idx_).first(shared_inputs.feature_values.size()); - } + auto SortedIdx(int num_nodes, bst_feature_t total_bins) { + if(!need_sort_histogram_) return common::Span(); + cat_sorted_idx_.resize(num_nodes * total_bins); return dh::ToSpan(cat_sorted_idx_); } - auto SortInput(EvaluateSplitInputs left, EvaluateSplitSharedInputs shared_inputs) { - if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) { - return dh::ToSpan(sort_input_).first(shared_inputs.feature_values.size()); - } + auto SortInput(int num_nodes, bst_feature_t total_bins) { + if(!need_sort_histogram_) return common::Span(); + sort_input_.resize(num_nodes * total_bins); return dh::ToSpan(sort_input_); } @@ -158,19 +156,19 @@ class GPUHistEvaluator { /** * \brief Sort the histogram based on output to obtain contiguous partitions. */ - common::Span SortHistogram( + common::Span SortHistogram(common::Span d_inputs, EvaluateSplitInputs const &left, EvaluateSplitInputs const &right,EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator); // impl of evaluate splits, contains CUDA kernels so it's public - void LaunchEvaluateSplits(EvaluateSplitInputs left, + void LaunchEvaluateSplits(common::Span d_inputs,EvaluateSplitInputs left, EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits); /** * \brief Evaluate splits for left and right nodes. 
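 *
 * \param d_inputs Device span with one EvaluateSplitInputs per node being
 * evaluated; entry i produces entry i of the output.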
*/ - void EvaluateSplits(GPUExpandEntry candidate, + void EvaluateSplits(common::Span d_inputs,GPUExpandEntry candidate, EvaluateSplitInputs left, EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index edcf01ff3268..6fd505298173 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -68,13 +68,13 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, } template -common::Span GPUHistEvaluator::SortHistogram( +common::Span GPUHistEvaluator::SortHistogram(common::Span d_inputs, EvaluateSplitInputs const &left, EvaluateSplitInputs const &right, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator) { dh::XGBCachingDeviceAllocator alloc; - auto sorted_idx = this->SortedIdx(left,shared_inputs); + auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); dh::Iota(sorted_idx); - auto data = this->SortInput(left,shared_inputs); + auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size()); auto it = thrust::make_counting_iterator(0u); auto d_feature_idx = dh::ToSpan(feature_idx_); thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data), diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 743247a8ca98..d29b9322f279 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -288,9 +288,9 @@ struct GPUHistMakerDevice { return split; } - void EvaluateLeftRightSplits(const std::vector &candidates, - const RegTree& tree, + void EvaluateLeftRightSplits(const std::vector& candidates, const RegTree& tree, common::Span pinned_candidates_out) { + dh::TemporaryArray d_node_inputs(2); for (int i = 0; i < candidates.size(); i++) { auto candidate = candidates.at(i); int left_nidx = tree[candidate.nid].LeftChild(); @@ -307,29 +307,29 @@ struct GPUHistMakerDevice { interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto h_node_inputs = pinned2.GetSpan(2); - dh::TemporaryArray d_node_inputs(2); h_node_inputs[0] = {left_nidx, candidate.split.left_sum, left_feature_set, hist.GetNodeHistogram(left_nidx)}; h_node_inputs[1] = {right_nidx, candidate.split.right_sum, right_feature_set, hist.GetNodeHistogram(right_nidx)}; - dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(),h_node_inputs.data(),h_node_inputs.size()*sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); + dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), + h_node_inputs.size() * sizeof(EvaluateSplitInputs), + cudaMemcpyDefault)); EvaluateSplitInputs left{left_nidx, candidate.split.left_sum, left_feature_set, hist.GetNodeHistogram(left_nidx)}; - EvaluateSplitInputs right{right_nidx, - candidate.split.right_sum, - right_feature_set, - hist.GetNodeHistogram(right_nidx)}; - EvaluateSplitSharedInputs shared_inputs{ - gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, - matrix.min_fvalue, - }; + EvaluateSplitInputs right{right_nidx, candidate.split.right_sum, right_feature_set, + hist.GetNodeHistogram(right_nidx)}; + EvaluateSplitSharedInputs shared_inputs{ + gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, + matrix.min_fvalue, + }; dh::TemporaryArray entries(2); - this->evaluator_.EvaluateSplits(candidate, left, right, shared_inputs, dh::ToSpan(entries)); + 
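// The per-node inputs were staged in pinned memory and copied to
// d_node_inputs with a single async memcpy above, so both children are
// evaluated in one batched call.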
this->evaluator_.EvaluateSplits(dh::ToSpan(d_node_inputs), candidate, left, right, shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); + dh::DefaultStream().Sync(); } } diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index fa98419f5223..a0440d5bce4a 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -249,7 +249,8 @@ TEST(GpuHist, EvaluateSplits) { GPUHistEvaluator evaluator{ tparam, static_cast(feature_min_values.size()), 0}; - evaluator.LaunchEvaluateSplits(input_left, input_right,shared_inputs, evaluator.GetEvaluator(), + dh::device_vector inputs = std::vector{input_left,input_right}; + evaluator.LaunchEvaluateSplits(dh::ToSpan(inputs),input_left, input_right,shared_inputs, evaluator.GetEvaluator(), dh::ToSpan(out_splits)); DeviceSplitCandidate result_left = out_splits[0]; From a688eef11839545511fdd4da037ba7cacb44d8cd Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Tue, 21 Jun 2022 07:58:08 -0700 Subject: [PATCH 03/17] Per node categorical storage --- src/tree/gpu_hist/evaluate_splits.cu | 35 ++++++++++++++------------- src/tree/gpu_hist/evaluate_splits.cuh | 30 ++++++++++++++--------- src/tree/gpu_hist/evaluator.cu | 1 + src/tree/updater_gpu_hist.cu | 3 ++- 4 files changed, 39 insertions(+), 30 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 30eb17157079..e89b7a9ce357 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -357,19 +357,21 @@ void GPUHistEvaluator::LaunchEvaluateSplits(common::Span -void GPUHistEvaluator::CopyToHost(EvaluateSplitInputs const &input, - common::Span cats_out) { - if (cats_out.empty()) return; +void GPUHistEvaluator::CopyToHost( const std::vector& nidx) { + if (!has_categoricals_) return; dh::CUDAEvent event; event.Record(dh::DefaultStream()); - auto h_cats = this->HostCatStorage(input.nidx); - copy_stream_.View().Wait(event); - dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(), - cudaMemcpyDeviceToHost, copy_stream_.View())); + for (auto idx : nidx) { + auto h_cats = this->HostCatStorage(idx); + auto d_cats = this->DeviceCatStorage(nidx).GetNodeCatStorage(idx); + copy_stream_.View().Wait(event); + dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), d_cats.data(), d_cats.size_bytes(), + cudaMemcpyDeviceToHost, copy_stream_.View())); + } } template -void GPUHistEvaluator::EvaluateSplits(common::Span d_inputs,GPUExpandEntry candidate, +void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, common::Span d_inputs,GPUExpandEntry candidate, EvaluateSplitInputs left, EvaluateSplitInputs right, EvaluateSplitSharedInputs shared_inputs, @@ -382,17 +384,16 @@ void GPUHistEvaluator::EvaluateSplits(common::SpanSortedIdx(d_inputs.size(),shared_inputs.feature_values.size()); auto d_entries = out_entries; - auto cats_out = this->DeviceCatStorage(left.nidx); + auto device_cats_accessor = this->DeviceCatStorage(nidx); // turn candidate into entry, along with handling sort based split. - dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) { + dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable{ auto const input = d_inputs[i]; auto &split = out_splits[i]; auto fidx = out_splits[i].findex; if (split.is_cat) { bool is_left = i == 0; - auto out = is_left ? 
cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2); - SetCategoricalSplit(input, shared_inputs,d_sorted_idx, fidx, is_left, out, &out_splits[i]); + SetCategoricalSplit(input, shared_inputs,d_sorted_idx, fidx, is_left, device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]); } float base_weight = @@ -404,7 +405,7 @@ void GPUHistEvaluator::EvaluateSplits(common::SpanCopyToHost(left, cats_out); + this->CopyToHost(nidx); } template @@ -416,24 +417,24 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( dh::device_vector inputs = std::vector{input}; this->LaunchEvaluateSplits(dh::ToSpan(inputs),input, {},shared_inputs, evaluator, out_split); - auto cats_out = this->DeviceCatStorage(input.nidx); + auto device_cats_accessor = this->DeviceCatStorage({input.nidx}); auto d_sorted_idx = this->SortedIdx(inputs.size(), shared_inputs.feature_values.size()); dh::TemporaryArray entries(1); auto d_entries = entries.data().get(); - dh::LaunchN(1, [=] __device__(size_t i) { + dh::LaunchN(1, [=] __device__(size_t i) mutable{ auto &split = out_split[i]; auto fidx = out_split[i].findex; if (split.is_cat) { - SetCategoricalSplit(input,shared_inputs, d_sorted_idx, fidx, true, cats_out, &out_split[i]); + SetCategoricalSplit(input,shared_inputs, d_sorted_idx, fidx, true, device_cats_accessor.GetNodeCatStorage(input.nidx), &out_split[i]); } float left_weight = evaluator.CalcWeight(0, shared_inputs.param, GradStats{split.left_sum}); float right_weight = evaluator.CalcWeight(0, shared_inputs.param, GradStats{split.right_sum}); d_entries[0] = GPUExpandEntry(0, 0, split, weight, left_weight, right_weight); }); - this->CopyToHost(input, cats_out); + this->CopyToHost({input.nidx}); GPUExpandEntry root_entry; dh::safe_cuda(cudaMemcpyAsync(&root_entry, entries.data().get(), diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 5bec8af32675..21105b1de98b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -39,6 +39,15 @@ struct EvaluateSplitSharedInputs { } }; +struct DeviceCatAccessor { + common::Span cat_storage_; + std::size_t node_categorical_storage_size_; + XGBOOST_DEVICE common::Span GetNodeCatStorage(bst_node_t nidx) { + return this->cat_storage_.subspan(nidx * this->node_categorical_storage_size_, + this->node_categorical_storage_size_); + } +}; + template class GPUHistEvaluator { using CatST = common::CatBitField::value_type; // categorical storage type @@ -65,18 +74,18 @@ class GPUHistEvaluator { // Do we have any categorical features that require sorting histograms? // use this to skip the expensive sort step bool need_sort_histogram_ = false; + bool has_categoricals_ = false; // Number of elements of categorical storage type // needed to hold categoricals for a single mode std::size_t node_categorical_storage_size_ = 0; // Copy the categories from device to host asynchronously. - void CopyToHost(EvaluateSplitInputs const &input, common::Span cats_out); + void CopyToHost( const std::vector& nidx); /** * \brief Get host category storage of nidx for internal calculation. 
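 *
 * The backing host buffer is grown on demand so that it can hold the
 * category bits for the requested node and its sibling.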
*/ auto HostCatStorage(bst_node_t nidx) { - std::size_t min_size=(nidx+2)*node_categorical_storage_size_; if(h_split_cats_.size() &nidx) { + if (!has_categoricals_) return DeviceCatAccessor{}; + auto max_nidx = *std::max_element(nidx.begin(), nidx.end()); + std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_; + if (split_cats_.size() < min_size) { split_cats_.resize(min_size); } - if (nidx == RegTree::kRoot) { - auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); - return cats_out; - } - auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2); - return cats_out; + return DeviceCatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_}; } /** @@ -168,7 +174,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(common::Span d_inputs,GPUExpandEntry candidate, + void EvaluateSplits(const std::vector &nidx,common::Span d_inputs,GPUExpandEntry candidate, EvaluateSplitInputs left, EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 6fd505298173..f46a13a8f0a8 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -21,6 +21,7 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, int32_t device) { param_ = param; tree_evaluator_ = TreeEvaluator{param, n_features, device}; + has_categoricals_ = cuts.HasCategorical(); if (cuts.HasCategorical()) { dh::XGBCachingDeviceAllocator alloc; auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan(); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index d29b9322f279..6ca6ec9f7d93 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -325,7 +325,8 @@ struct GPUHistMakerDevice { }; dh::TemporaryArray entries(2); - this->evaluator_.EvaluateSplits(dh::ToSpan(d_node_inputs), candidate, left, right, shared_inputs, dh::ToSpan(entries)); + std::vector nidx = {left_nidx, right_nidx}; + this->evaluator_.EvaluateSplits(nidx,dh::ToSpan(d_node_inputs), candidate, left, right, shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); From 44fb1507fbe2be81bfe43eb37cf23fdb49288216 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 22 Jun 2022 05:10:28 -0700 Subject: [PATCH 04/17] Sort histogram for many nodes. 
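The histogram sort comparator now segments the flat bin index space by
node first, then by feature within the node, and finally orders
categorical bins by their gradient-based weight, so one device-wide sort
covers every node in the batch. A host-side sketch of the intended
ordering follows; it is illustrative only: the array contents are made
up, bin_to_feature and weight are analogues of d_feature_idx and the
computed CalcWeightCat values, and std::stable_sort stands in for the
thrust device sort used in evaluator.cu.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <numeric>
    #include <vector>

    int main() {
      std::size_t total_bins = 4;  // bins over all features of one node
      // bin -> feature map, the analogue of d_feature_idx
      std::vector<std::uint32_t> bin_to_feature{0, 0, 1, 1};
      // per-bin weights for two nodes, laid out back to back
      std::vector<double> weight{0.3, 0.1, 0.0, 0.0,   // node 0
                                 0.2, 0.4, 0.0, 0.0};  // node 1
      std::vector<std::uint32_t> sorted_idx(weight.size());
      std::iota(sorted_idx.begin(), sorted_idx.end(), 0u);
      std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
                       [&](std::uint32_t li, std::uint32_t ri) {
                         auto l_node = li / total_bins;  // node segment
                         auto r_node = ri / total_bins;
                         if (l_node != r_node) return l_node < r_node;
                         auto lf = bin_to_feature[li % total_bins];
                         auto rf = bin_to_feature[ri % total_bins];
                         if (lf != rf) return lf < rf;  // feature segment
                         return weight[li] < weight[ri];  // weight order
                       });
      // sorted_idx now lists node-0 bins first, grouped by feature,
      // with categorical bins in ascending weight order inside a feature.
      return 0;
    }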
--- src/tree/gpu_hist/evaluate_splits.cu | 9 +++++---- src/tree/gpu_hist/evaluate_splits.cuh | 2 +- src/tree/gpu_hist/evaluator.cu | 27 ++++++++++++++++----------- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index e89b7a9ce357..291ac2766c80 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -314,12 +314,13 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &input,const Evalu } template -void GPUHistEvaluator::LaunchEvaluateSplits(common::Span d_inputs, - EvaluateSplitInputs left, EvaluateSplitInputs right, - EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, +void GPUHistEvaluator::LaunchEvaluateSplits( + common::Span d_inputs, EvaluateSplitInputs left, + EvaluateSplitInputs right, EvaluateSplitSharedInputs shared_inputs, + TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { if (need_sort_histogram_) { - this->SortHistogram(d_inputs,left, right, shared_inputs,evaluator); + this->SortHistogram(d_inputs, shared_inputs, evaluator); } size_t combined_num_features = left.feature_set.size() + right.feature_set.size(); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 21105b1de98b..84fb6aef9e3d 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -163,7 +163,7 @@ class GPUHistEvaluator { * \brief Sort the histogram based on output to obtain contiguous partitions. */ common::Span SortHistogram(common::Span d_inputs, - EvaluateSplitInputs const &left, EvaluateSplitInputs const &right,EvaluateSplitSharedInputs shared_inputs, + EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator); // impl of evaluate splits, contains CUDA kernels so it's public diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index f46a13a8f0a8..703599d2b668 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -70,7 +70,7 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, template common::Span GPUHistEvaluator::SortHistogram(common::Span d_inputs, - EvaluateSplitInputs const &left, EvaluateSplitInputs const &right, EvaluateSplitSharedInputs shared_inputs, + EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator) { dh::XGBCachingDeviceAllocator alloc; auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); @@ -78,33 +78,38 @@ common::Span GPUHistEvaluator::SortHistogram( auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size()); auto it = thrust::make_counting_iterator(0u); auto d_feature_idx = dh::ToSpan(feature_idx_); + auto total_bins = shared_inputs.feature_values.size(); thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data), [=] XGBOOST_DEVICE(uint32_t i) { - auto is_left = i < shared_inputs.feature_values.size(); - auto const &input = is_left ? left : right; - auto j = i - (is_left ? 
0 : shared_inputs.feature_values.size()); + auto const &input = d_inputs[i / total_bins]; + auto j = i % total_bins; auto fidx = d_feature_idx[j]; if (common::IsCat(shared_inputs.feature_types, fidx)) { - auto lw = evaluator.CalcWeightCat(shared_inputs.param, input.gradient_histogram[j]); + auto lw = evaluator.CalcWeightCat(shared_inputs.param, + input.gradient_histogram[j]); return thrust::make_tuple(i, lw); } return thrust::make_tuple(i, 0.0); }); + // Sort an array segmented according to + // - nodes + // - features within each node + // - gradients within each feature thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data), dh::tbegin(sorted_idx), [=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) { auto li = thrust::get<0>(l); auto ri = thrust::get<0>(r); - auto l_is_left = li < shared_inputs.feature_values.size(); - auto r_is_left = ri < shared_inputs.feature_values.size(); + auto l_node = li / total_bins; + auto r_node = ri / total_bins; - if (l_is_left != r_is_left) { - return l_is_left; // not the same node + if (l_node != r_node) { + return l_node < r_node; // not the same node } - li -= (l_is_left ? 0 : shared_inputs.feature_values.size()); - ri -= (r_is_left ? 0 : shared_inputs.feature_values.size()); + li = li % total_bins; + ri = ri % total_bins; auto lfidx = d_feature_idx[li]; auto rfidx = d_feature_idx[ri]; From 8fcfe996ca3bd22510ad77c7aee7d9cdc28ccfbe Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 22 Jun 2022 05:56:11 -0700 Subject: [PATCH 05/17] Remove left right args. --- src/tree/gpu_hist/evaluate_splits.cu | 51 +++++++------------ src/tree/gpu_hist/evaluate_splits.cuh | 8 ++- src/tree/updater_gpu_hist.cu | 7 ++- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 2 +- 4 files changed, 27 insertions(+), 41 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 291ac2766c80..7714024edec5 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -199,8 +199,7 @@ __device__ void EvaluateFeature( } template -__global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, - EvaluateSplitInputs right, +__global__ void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, @@ -230,13 +229,11 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, __syncthreads(); - // If this block is working on the left or right node - bool is_left = blockIdx.x < left.feature_set.size(); - EvaluateSplitInputs& inputs = is_left ? left : right; - + // Allocate blocks to one feature of one node + const auto input_idx = blockIdx.x / number_active_features; + const EvaluateSplitInputs &inputs = d_inputs[input_idx]; // One block for each feature. Features are sampled, so fidx != blockIdx.x - int fidx = inputs.feature_set[is_left ? blockIdx.x - : blockIdx.x - left.feature_set.size()]; + int fidx = inputs.feature_set[blockIdx.x % number_active_features]; if (common::IsCat(shared_inputs.feature_types, fidx)) { auto n_bins_in_feat = shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx]; @@ -244,12 +241,12 @@ __global__ void EvaluateSplitsKernel(EvaluateSplitInputs left, EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { - auto node_sorted_idx = is_left ? 
sorted_idx.first(shared_inputs.feature_values.size()) - : sorted_idx.last(shared_inputs.feature_values.size()); - size_t offset = is_left ? 0 : shared_inputs.feature_values.size(); + auto total_bins = shared_inputs.feature_values.size(); + size_t offset = total_bins * input_idx; + auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); EvaluateFeature(fidx, inputs,shared_inputs, evaluator, node_sorted_idx, offset, &best_split, - &temp_storage); + kPart>(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset, + &best_split, &temp_storage); } } else { EvaluateFeature -void GPUHistEvaluator::LaunchEvaluateSplits( - common::Span d_inputs, EvaluateSplitInputs left, - EvaluateSplitInputs right, EvaluateSplitSharedInputs shared_inputs, +void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_active_features, + common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { if (need_sort_histogram_) { this->SortHistogram(d_inputs, shared_inputs, evaluator); } - size_t combined_num_features = left.feature_set.size() + right.feature_set.size(); + size_t combined_num_features = number_active_features*d_inputs.size(); dh::TemporaryArray feature_best_splits(combined_num_features); // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, left, right, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features,d_inputs, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features auto reduce_offset = dh::MakeTransformIterator(thrust::make_counting_iterator(0llu), [=] __device__(size_t idx) -> size_t { - if (idx == 0) { - return 0; - } - if (idx == 1) { - return left.feature_set.size(); - } - if (idx == 2) { - return combined_num_features; - } - return 0; + return idx*number_active_features; }); size_t temp_storage_bytes = 0; auto num_segments = out_splits.size(); @@ -372,16 +359,14 @@ void GPUHistEvaluator::CopyToHost( const std::vector& } template -void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, common::Span d_inputs,GPUExpandEntry candidate, - EvaluateSplitInputs left, - EvaluateSplitInputs right, +void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, bst_feature_t number_active_features,common::Span d_inputs,GPUExpandEntry candidate, EvaluateSplitSharedInputs shared_inputs, common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(d_inputs,left, right, shared_inputs,evaluator, out_splits); + this->LaunchEvaluateSplits(number_active_features,d_inputs,shared_inputs,evaluator, out_splits); auto d_sorted_idx = this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()); auto d_entries = out_entries; @@ -416,7 +401,7 @@ GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( auto out_split = dh::ToSpan(splits_out); auto evaluator = tree_evaluator_.GetEvaluator(); dh::device_vector inputs = std::vector{input}; - this->LaunchEvaluateSplits(dh::ToSpan(inputs),input, {},shared_inputs, evaluator, out_split); + this->LaunchEvaluateSplits(input.feature_set.size(),dh::ToSpan(inputs),shared_inputs, 
evaluator, out_split); auto device_cats_accessor = this->DeviceCatStorage({input.nidx}); auto d_sorted_idx = this->SortedIdx(inputs.size(), shared_inputs.feature_values.size()); diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 84fb6aef9e3d..5fdd3ade8b58 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -167,16 +167,14 @@ class GPUHistEvaluator { TreeEvaluator::SplitEvaluator evaluator); // impl of evaluate splits, contains CUDA kernels so it's public - void LaunchEvaluateSplits(common::Span d_inputs,EvaluateSplitInputs left, - EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, + void LaunchEvaluateSplits(bst_feature_t number_active_features,common::Span d_inputs,EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits); /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx,common::Span d_inputs,GPUExpandEntry candidate, - EvaluateSplitInputs left, - EvaluateSplitInputs right,EvaluateSplitSharedInputs shared_inputs, + void EvaluateSplits(const std::vector &nidx,bst_feature_t number_active_features,common::Span d_inputs,GPUExpandEntry candidate, + EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); /** * \brief Evaluate splits for root node. diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 6ca6ec9f7d93..0b4edde0564f 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -323,10 +323,13 @@ struct GPUHistMakerDevice { gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, }; - + bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); + CHECK_EQ(h_node_inputs[1].feature_set.size(), number_active_features) + << "Current implementation assumes that the number of active features " + "(after sampling) in any node is the same"; dh::TemporaryArray entries(2); std::vector nidx = {left_nidx, right_nidx}; - this->evaluator_.EvaluateSplits(nidx,dh::ToSpan(d_node_inputs), candidate, left, right, shared_inputs, dh::ToSpan(entries)); + this->evaluator_.EvaluateSplits(nidx,number_active_features,dh::ToSpan(d_node_inputs), candidate, shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index a0440d5bce4a..3ba33c6121fb 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -250,7 +250,7 @@ TEST(GpuHist, EvaluateSplits) { GPUHistEvaluator evaluator{ tparam, static_cast(feature_min_values.size()), 0}; dh::device_vector inputs = std::vector{input_left,input_right}; - evaluator.LaunchEvaluateSplits(dh::ToSpan(inputs),input_left, input_right,shared_inputs, evaluator.GetEvaluator(), + evaluator.LaunchEvaluateSplits(input_left.feature_set.size(),dh::ToSpan(inputs),shared_inputs, evaluator.GetEvaluator(), dh::ToSpan(out_splits)); DeviceSplitCandidate result_left = out_splits[0]; From a14f41fc60067401cbc6fb32c5814d1d44580454 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 22 Jun 2022 06:29:04 -0700 Subject: [PATCH 06/17] Partially remove single split --- src/tree/gpu_hist/evaluate_splits.cu | 47 +++++-------------- src/tree/gpu_hist/evaluate_splits.cuh | 3 +- 
src/tree/updater_gpu_hist.cu | 12 ++--- .../cpp/tree/gpu_hist/test_evaluate_splits.cu | 14 +++--- 4 files changed, 25 insertions(+), 51 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 7714024edec5..4868ab177093 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -268,9 +268,9 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, /** * \brief Set the bits for categorical splits based on the split threshold. */ -__device__ void SetCategoricalSplit(EvaluateSplitInputs const &input,const EvaluateSplitSharedInputs &shared_inputs, +__device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inputs, common::Span d_sorted_idx, bst_feature_t fidx, - bool is_left, common::Span out, + std::size_t input_idx, common::Span out, DeviceSplitCandidate *p_out_split) { auto &out_split = *p_out_split; out_split.split_cats = common::CatBitField{out}; @@ -282,9 +282,8 @@ __device__ void SetCategoricalSplit(EvaluateSplitInputs const &input,const Evalu } auto node_sorted_idx = - is_left ? d_sorted_idx.subspan(0, shared_inputs.feature_values.size()) - : d_sorted_idx.subspan(shared_inputs.feature_values.size(), shared_inputs.feature_values.size()); - size_t node_offset = is_left ? 0 : shared_inputs.feature_values.size(); + d_sorted_idx.subspan(shared_inputs.feature_values.size()*input_idx, shared_inputs.feature_values.size()); + size_t node_offset = input_idx* shared_inputs.feature_values.size(); auto best_thresh = out_split.PopBestThresh(); auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); @@ -359,7 +358,7 @@ void GPUHistEvaluator::CopyToHost( const std::vector& } template -void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, bst_feature_t number_active_features,common::Span d_inputs,GPUExpandEntry candidate, +void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, bst_feature_t number_active_features,common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); @@ -378,8 +377,7 @@ void GPUHistEvaluator::EvaluateSplits(const std::vector::EvaluateSplits(const std::vector::EvaluateSplits(const std::vector GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs,float weight) { - dh::TemporaryArray splits_out(1); - auto out_split = dh::ToSpan(splits_out); - auto evaluator = tree_evaluator_.GetEvaluator(); dh::device_vector inputs = std::vector{input}; - this->LaunchEvaluateSplits(input.feature_set.size(),dh::ToSpan(inputs),shared_inputs, evaluator, out_split); - - auto device_cats_accessor = this->DeviceCatStorage({input.nidx}); - auto d_sorted_idx = this->SortedIdx(inputs.size(), shared_inputs.feature_values.size()); - - dh::TemporaryArray entries(1); - auto d_entries = entries.data().get(); - dh::LaunchN(1, [=] __device__(size_t i) mutable{ - auto &split = out_split[i]; - auto fidx = out_split[i].findex; - - if (split.is_cat) { - SetCategoricalSplit(input,shared_inputs, d_sorted_idx, fidx, true, device_cats_accessor.GetNodeCatStorage(input.nidx), &out_split[i]); - } - - float left_weight = evaluator.CalcWeight(0, shared_inputs.param, GradStats{split.left_sum}); - float right_weight = evaluator.CalcWeight(0, shared_inputs.param, GradStats{split.right_sum}); - d_entries[0] = GPUExpandEntry(0, 0, split, weight, left_weight, right_weight); - 
}); - this->CopyToHost({input.nidx}); - + dh::TemporaryArray out_entries(1); + this->EvaluateSplits({input.nidx},input.feature_set.size(),dh::ToSpan(inputs),shared_inputs,dh::ToSpan(out_entries)); GPUExpandEntry root_entry; - dh::safe_cuda(cudaMemcpyAsync(&root_entry, entries.data().get(), - sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), + sizeof(GPUExpandEntry), cudaMemcpyDeviceToHost)); return root_entry; + } template class GPUHistEvaluator; diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 5fdd3ade8b58..968d428bb5e1 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -21,6 +21,7 @@ namespace tree { // Inputs specific to each node struct EvaluateSplitInputs { int nidx; + int depth; GradientPairPrecise parent_sum; common::Span feature_set; common::Span gradient_histogram; @@ -173,7 +174,7 @@ class GPUHistEvaluator { /** * \brief Evaluate splits for left and right nodes. */ - void EvaluateSplits(const std::vector &nidx,bst_feature_t number_active_features,common::Span d_inputs,GPUExpandEntry candidate, + void EvaluateSplits(const std::vector &nidx,bst_feature_t number_active_features,common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, common::Span out_splits); /** diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 0b4edde0564f..cc559ee66a8b 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -279,7 +279,7 @@ struct GPUHistMakerDevice { common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); - EvaluateSplitInputs inputs{nidx, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; + EvaluateSplitInputs inputs{nidx, 1, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -307,18 +307,14 @@ struct GPUHistMakerDevice { interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto h_node_inputs = pinned2.GetSpan(2); - h_node_inputs[0] = {left_nidx, candidate.split.left_sum, left_feature_set, + h_node_inputs[0] = {left_nidx, candidate.depth+1,candidate.split.left_sum, left_feature_set, hist.GetNodeHistogram(left_nidx)}; - h_node_inputs[1] = {right_nidx, candidate.split.right_sum, right_feature_set, + h_node_inputs[1] = {right_nidx, candidate.depth+1,candidate.split.right_sum, right_feature_set, hist.GetNodeHistogram(right_nidx)}; dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); - EvaluateSplitInputs left{left_nidx, candidate.split.left_sum, left_feature_set, - hist.GetNodeHistogram(left_nidx)}; - EvaluateSplitInputs right{right_nidx, candidate.split.right_sum, right_feature_set, - hist.GetNodeHistogram(right_nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -329,7 +325,7 @@ struct GPUHistMakerDevice { "(after sampling) in any node is the same"; dh::TemporaryArray entries(2); std::vector nidx = {left_nidx, right_nidx}; - this->evaluator_.EvaluateSplits(nidx,number_active_features,dh::ToSpan(d_node_inputs), candidate, shared_inputs, dh::ToSpan(entries)); + 
this->evaluator_.EvaluateSplits(nidx,number_active_features,dh::ToSpan(d_node_inputs), shared_inputs, dh::ToSpan(entries)); dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu index 3ba33c6121fb..f2750ed67bcc 100644 --- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu +++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu @@ -50,7 +50,7 @@ void TestEvaluateSingleSplit(bool is_categorical) { d_feature_types = dh::ToSpan(feature_types); } - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; @@ -98,7 +98,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) { thrust::device_vector feature_histogram = std::vector{{-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; @@ -148,7 +148,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) { std::vector{ {-10.0, 0.5}, {10.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(2, 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; @@ -188,7 +188,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) { std::vector{ {-0.5, 0.5}, {0.5, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(2, 0); - EvaluateSplitInputs input{1, + EvaluateSplitInputs input{1,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram)}; @@ -230,12 +230,12 @@ TEST(GpuHist, EvaluateSplits) { {-1.0, 0.5}, {1.0, 0.5}, {-0.5, 0.5}, {0.5, 0.5}}; thrust::device_vector monotonic_constraints(feature_set.size(), 0); EvaluateSplitInputs input_left{ - 1, + 1,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_left)}; EvaluateSplitInputs input_right{ - 2, + 2,0, parent_sum, dh::ToSpan(feature_set), dh::ToSpan(feature_histogram_right)}; @@ -279,7 +279,7 @@ TEST_F(TestPartitionBasedSplit, GpuHist) { cudaMemcpyHostToDevice)); dh::device_vector feature_set{std::vector{0}}; - EvaluateSplitInputs input{0, + EvaluateSplitInputs input{0,0, total_gpair_, dh::ToSpan(feature_set), dh::ToSpan(d_hist)}; From 525435322b89fb9f9edd104c30b5833ef3e7c033 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Wed, 22 Jun 2022 06:42:34 -0700 Subject: [PATCH 07/17] Batched working. 
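Candidates from one level are now evaluated together: the per-node inputs
are staged in pinned memory, copied to the device once, and evaluated with
a single kernel launch and a single stream sync, instead of one
copy/launch/sync per candidate.

A minimal sketch of the batching pattern, under illustrative names
(NodeInput, EvaluateAllKernel, EvaluateAll are placeholders, not the real
API; the real code reuses pinned buffers and passes spans):

    #include <cuda_runtime.h>
    #include <vector>

    struct NodeInput { int nidx; /* feature set + histogram spans elided */ };

    __global__ void EvaluateAllKernel(const NodeInput *inputs, int n_features) {
      const int input_idx = blockIdx.x / n_features;  // which node
      const int fidx = blockIdx.x % n_features;       // which of its features
      (void)inputs[input_idx]; (void)fidx;            // split evaluation elided
    }

    void EvaluateAll(const std::vector<NodeInput> &h_inputs, int n_features) {
      NodeInput *d_inputs = nullptr;
      cudaMalloc(&d_inputs, h_inputs.size() * sizeof(NodeInput));
      // One copy for the whole batch...
      cudaMemcpyAsync(d_inputs, h_inputs.data(),
                      h_inputs.size() * sizeof(NodeInput), cudaMemcpyDefault);
      // ...and one launch: a block per (node, feature) pair.
      const int n_blocks = static_cast<int>(h_inputs.size()) * n_features;
      EvaluateAllKernel<<<n_blocks, 256>>>(d_inputs, n_features);
      cudaDeviceSynchronize();
      cudaFree(d_inputs);
    }

Batching should also expose more parallelism near the leaves, where many
small nodes would otherwise each get their own tiny grid.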
--- src/tree/updater_gpu_hist.cu | 59 +++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 28 deletions(-) diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index cc559ee66a8b..371f65c3f5b7 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -290,13 +290,23 @@ struct GPUHistMakerDevice { void EvaluateLeftRightSplits(const std::vector& candidates, const RegTree& tree, common::Span pinned_candidates_out) { - dh::TemporaryArray d_node_inputs(2); + if (candidates.empty()) return; + dh::TemporaryArray d_node_inputs(2 * candidates.size()); + dh::TemporaryArray splits_out(2 * candidates.size()); + std::vector nidx(2 * candidates.size()); + auto h_node_inputs = pinned2.GetSpan(2 * candidates.size()); + auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); + EvaluateSplitSharedInputs shared_inputs{ + GPUTrainingParam(param), feature_types, matrix.feature_segments, + matrix.gidx_fvalue_map, matrix.min_fvalue, + }; + dh::TemporaryArray entries(2 * candidates.size()); for (int i = 0; i < candidates.size(); i++) { auto candidate = candidates.at(i); int left_nidx = tree[candidate.nid].LeftChild(); int right_nidx = tree[candidate.nid].RightChild(); - dh::TemporaryArray splits_out(2); - GPUTrainingParam gpu_param(param); + nidx[i * 2] = left_nidx; + nidx[i * 2 + 1] = right_nidx; auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx)); left_sampled_features->SetDevice(ctx_->gpu_id); common::Span left_feature_set = @@ -305,32 +315,25 @@ struct GPUHistMakerDevice { right_sampled_features->SetDevice(ctx_->gpu_id); common::Span right_feature_set = interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); - auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); - auto h_node_inputs = pinned2.GetSpan(2); - h_node_inputs[0] = {left_nidx, candidate.depth+1,candidate.split.left_sum, left_feature_set, - hist.GetNodeHistogram(left_nidx)}; - h_node_inputs[1] = {right_nidx, candidate.depth+1,candidate.split.right_sum, right_feature_set, - hist.GetNodeHistogram(right_nidx)}; - dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), - h_node_inputs.size() * sizeof(EvaluateSplitInputs), - cudaMemcpyDefault)); - - EvaluateSplitSharedInputs shared_inputs{ - gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, - matrix.min_fvalue, - }; - bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); - CHECK_EQ(h_node_inputs[1].feature_set.size(), number_active_features) - << "Current implementation assumes that the number of active features " - "(after sampling) in any node is the same"; - dh::TemporaryArray entries(2); - std::vector nidx = {left_nidx, right_nidx}; - this->evaluator_.EvaluateSplits(nidx,number_active_features,dh::ToSpan(d_node_inputs), shared_inputs, dh::ToSpan(entries)); - dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.subspan(i * 2, 2).data(), - entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), - cudaMemcpyDeviceToHost)); - dh::DefaultStream().Sync(); + h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum, + left_feature_set, hist.GetNodeHistogram(left_nidx)}; + h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum, + right_feature_set, hist.GetNodeHistogram(right_nidx)}; } + bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); + CHECK_EQ(h_node_inputs[1].feature_set.size(), number_active_features) + << "Current implementation assumes 
that the number of active features " + "(after sampling) in any node is the same"; + dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), + h_node_inputs.size() * sizeof(EvaluateSplitInputs), + cudaMemcpyDefault)); + + this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs), + shared_inputs, dh::ToSpan(entries)); + dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), + entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), + cudaMemcpyDeviceToHost)); + dh::DefaultStream().Sync(); } void BuildHist(int nidx) { From 7e3f54831ad8aff6ad979ca276ca04fca6585fc0 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 23 Jun 2022 05:27:46 -0700 Subject: [PATCH 08/17] All working. --- src/tree/gpu_hist/evaluate_splits.cu | 11 ++++++----- src/tree/gpu_hist/evaluate_splits.cuh | 25 ++++++++++++------------- src/tree/updater_gpu_hist.cu | 16 +++++++++------- 3 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 4868ab177093..f1e96c843e87 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -344,16 +344,17 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a } template -void GPUHistEvaluator::CopyToHost( const std::vector& nidx) { +void GPUHistEvaluator::CopyToHost(const std::vector &nidx) { if (!has_categoricals_) return; + auto d_cats = this->DeviceCatStorage(nidx); + auto h_cats = this->HostCatStorage(nidx); dh::CUDAEvent event; event.Record(dh::DefaultStream()); for (auto idx : nidx) { - auto h_cats = this->HostCatStorage(idx); - auto d_cats = this->DeviceCatStorage(nidx).GetNodeCatStorage(idx); copy_stream_.View().Wait(event); - dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), d_cats.data(), d_cats.size_bytes(), - cudaMemcpyDeviceToHost, copy_stream_.View())); + dh::safe_cuda(cudaMemcpyAsync( + h_cats.GetNodeCatStorage(idx).data(), d_cats.GetNodeCatStorage(idx).data(), + d_cats.GetNodeCatStorage(idx).size_bytes(), cudaMemcpyDeviceToHost, copy_stream_.View())); } } diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index 968d428bb5e1..f87064ccf079 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -40,7 +40,9 @@ struct EvaluateSplitSharedInputs { } }; -struct DeviceCatAccessor { +// Used to return internal storage regions for categoricals +// Usable on device +struct CatAccessor { common::Span cat_storage_; std::size_t node_categorical_storage_size_; XGBOOST_DEVICE common::Span GetNodeCatStorage(bst_node_t nidx) { @@ -86,31 +88,28 @@ class GPUHistEvaluator { /** * \brief Get host category storage of nidx for internal calculation. 
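 *
 * The backing array is linear: node n owns the bit slot
 * [n * node_categorical_storage_size_, (n + 1) * node_categorical_storage_size_),
 * so resizing to (max_nidx + 2) slots covers every node in nidx plus the
 * slot after the largest one.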
*/ - auto HostCatStorage(bst_node_t nidx) { - std::size_t min_size=(nidx+2)*node_categorical_storage_size_; - if(h_split_cats_.size() &nidx) { + if (!has_categoricals_) return CatAccessor{}; + auto max_nidx = *std::max_element(nidx.begin(), nidx.end()); + std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_; + if (h_split_cats_.size() < min_size) { h_split_cats_.resize(min_size); } - - if (nidx == RegTree::kRoot) { - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_); - return cats_out; - } - auto cats_out = common::Span{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2); - return cats_out; + return CatAccessor{{h_split_cats_.data(), h_split_cats_.size()}, + node_categorical_storage_size_}; } /** * \brief Get device category storage of nidx for internal calculation. */ auto DeviceCatStorage(const std::vector &nidx) { - if (!has_categoricals_) return DeviceCatAccessor{}; + if (!has_categoricals_) return CatAccessor{}; auto max_nidx = *std::max_element(nidx.begin(), nidx.end()); std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_; if (split_cats_.size() < min_size) { split_cats_.resize(min_size); } - return DeviceCatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_}; + return CatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_}; } /** diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 371f65c3f5b7..f915a409a097 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -279,7 +279,7 @@ struct GPUHistMakerDevice { common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); - EvaluateSplitInputs inputs{nidx, 1, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; + EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)}; EvaluateSplitSharedInputs shared_inputs{ gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map, matrix.min_fvalue, @@ -288,7 +288,7 @@ struct GPUHistMakerDevice { return split; } - void EvaluateLeftRightSplits(const std::vector& candidates, const RegTree& tree, + void EvaluateSplits(const std::vector& candidates, const RegTree& tree, common::Span pinned_candidates_out) { if (candidates.empty()) return; dh::TemporaryArray d_node_inputs(2 * candidates.size()); @@ -321,9 +321,11 @@ struct GPUHistMakerDevice { right_feature_set, hist.GetNodeHistogram(right_nidx)}; } bst_feature_t number_active_features = h_node_inputs[0].feature_set.size(); - CHECK_EQ(h_node_inputs[1].feature_set.size(), number_active_features) - << "Current implementation assumes that the number of active features " - "(after sampling) in any node is the same"; + for (auto input : h_node_inputs) { + CHECK_EQ(input.feature_set.size(), number_active_features) + << "Current implementation assumes that the number of active features " + "(after sampling) in any node is the same"; + } dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(), h_node_inputs.size() * sizeof(EvaluateSplitInputs), cudaMemcpyDefault)); @@ -334,7 +336,7 @@ struct GPUHistMakerDevice { entries.data().get(), sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost)); dh::DefaultStream().Sync(); - } + } void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); @@ -703,7 +705,7 @@ struct GPUHistMakerDevice { 
monitor.Stop("BuildHist"); monitor.Start("EvaluateSplits"); - this->EvaluateLeftRightSplits(filtered_expand_set, *p_tree, new_candidates); + this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates); monitor.Stop("EvaluateSplits"); dh::DefaultStream().Sync(); driver.Push(new_candidates.begin(), new_candidates.end()); From 9a3eb149a0116e792bba28326892a79a2ed52df2 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Thu, 23 Jun 2022 09:14:20 -0700 Subject: [PATCH 09/17] Launch bounds doubles occupancy. --- src/tree/gpu_hist/evaluate_splits.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index f1e96c843e87..90687376c1ff 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -199,7 +199,7 @@ __device__ void EvaluateFeature( } template -__global__ void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, @@ -233,6 +233,7 @@ __global__ void EvaluateSplitsKernel(bst_feature_t number_active_features,common const auto input_idx = blockIdx.x / number_active_features; const EvaluateSplitInputs &inputs = d_inputs[input_idx]; // One block for each feature. Features are sampled, so fidx != blockIdx.x + int fidx = inputs.feature_set[blockIdx.x % number_active_features]; if (common::IsCat(shared_inputs.feature_types, fidx)) { From d5e8eac58f7fde53a343bc113390568d32c75235 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 24 Jun 2022 03:01:29 -0700 Subject: [PATCH 10/17] Use ldg --- src/tree/gpu_hist/evaluate_splits.cu | 29 ++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 90687376c1ff..ef509c3a9e60 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -73,6 +73,26 @@ ReduceFeature(common::Span feature_histogram, cub::CTA_SYNC(); return shared_sum; } +// Force nvcc to load data as constant +template +class LDGIterator { + typedef typename cub::UnitWord::DeviceWord DeviceWordT; + static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); + + const T* ptr; + + public: + XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} + __device__ T operator[](std::size_t idx) const { + DeviceWordT tmp[kNumWords]; +#pragma unroll + for (int i = 0; i < kNumWords; i++) { + tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + } + return *reinterpret_cast(tmp); + } +}; + /*! \brief Find the thread with best gain. 
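 *
 * Every thread proposes the pair (threadIdx.x, gain); a block-wide
 * cub::ArgMax reduction selects the winner, and only the winning thread
 * updates the shared-memory best_split.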
*/ template ldg_feature_segments(shared_inputs.feature_segments.data()); + uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin uint32_t gidx_end = - shared_inputs.feature_segments[fidx + 1]; // end bin for i^th feature + ldg_feature_segments[fidx + 1]; // end bin for i^th feature auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); // Sum histogram bins for current feature @@ -199,7 +220,7 @@ __device__ void EvaluateFeature( } template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,LDGIterator d_inputs, const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, @@ -325,7 +346,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features,d_inputs, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features,LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features From dd1138300cd18c121068b03499f0a0ef72947c7a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 24 Jun 2022 03:33:58 -0700 Subject: [PATCH 11/17] Separate EvaluateFeature implementations. --- src/common/device_helpers.cuh | 21 ++ src/tree/gpu_hist/evaluate_splits.cu | 293 +++++++++++++++++---------- 2 files changed, 210 insertions(+), 104 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 123dc14e57be..da19fe2b510e 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1939,4 +1939,25 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; + +// Force nvcc to load data as constant +template +class LDGIterator { + typedef typename cub::UnitWord::DeviceWord DeviceWordT; + static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); + + const T* ptr; + + public: + XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} + __device__ T operator[](std::size_t idx) const { + DeviceWordT tmp[kNumWords]; +#pragma unroll + for (int i = 0; i < kNumWords; i++) { + tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + } + return *reinterpret_cast(tmp); + } +}; + } // namespace dh diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index ef509c3a9e60..38c9fbe3fd2c 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -47,7 +47,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, * \param end * \param temp_storage Shared memory for intermediate result. */ -template __device__ GradientSumT ReduceFeature(common::Span feature_histogram, @@ -60,9 +60,14 @@ ReduceFeature(common::Span feature_histogram, auto begin = feature_histogram.data(); auto end = begin + feature_histogram.size(); for (auto itr = begin; itr < end; itr += BLOCK_THREADS) { - bool thread_active = itr + threadIdx.x < end; - // Scan histogram - GradientSumT bin = thread_active ? 
*(itr + threadIdx.x) : GradientSumT(); + double tmp[2]; + GradientSumT &bin = *reinterpret_cast(&tmp[0]); + BlockLoadT(temp_storage->load) + .Load(reinterpret_cast(itr), tmp, + min(int(end - itr), int(BLOCK_THREADS)) * 2, 0.0); + __syncthreads(); + //bool thread_active = itr + threadIdx.x < end; + //GradientSumT bin = thread_active ? *(itr + threadIdx.x) : GradientSumT(); local_sum += bin; } local_sum = ReduceT(temp_storage->sum_reduce).Reduce(local_sum, cub::Sum()); @@ -73,31 +78,82 @@ ReduceFeature(common::Span feature_histogram, cub::CTA_SYNC(); return shared_sum; } -// Force nvcc to load data as constant -template -class LDGIterator { - typedef typename cub::UnitWord::DeviceWord DeviceWordT; - static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); - - const T* ptr; - - public: - XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} - __device__ T operator[](std::size_t idx) const { - DeviceWordT tmp[kNumWords]; -#pragma unroll - for (int i = 0; i < kNumWords; i++) { - tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); + +/*! \brief Find the thread with best gain. */ +template +__device__ void EvaluatePartitionFeature( + int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, + TreeEvaluator::SplitEvaluator evaluator, + common::Span sorted_idx, size_t offset, + DeviceSplitCandidate *best_split, // shared memory storing best split + TempStorageT *temp_storage // temp memory for cub operations +) { + // Use pointer from cut to indicate begin and end of bins for each feature. + dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); + uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin + uint32_t gidx_end = + ldg_feature_segments[fidx + 1]; // end bin for i^th feature + auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); + + // Sum histogram bins for current feature + GradientSumT const feature_sum = + ReduceFeature(feature_hist, temp_storage); + + GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; + float const null_gain = -std::numeric_limits::infinity(); + + SumCallbackOp prefix_op = SumCallbackOp(); + for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { + bool thread_active = (scan_begin + threadIdx.x) < gidx_end; + + auto rest = thread_active + ? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset] + : GradientSumT(); + // No min value for cat feature, use inclusive scan. + ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op); + GradientSumT bin = + GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT + // Whether the gradient of missing values is put to the left side. 
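+      // LossChangeMissing evaluates the gain with the missing-value bucket
+      // placed on each side of the split and sets missing_left to the
+      // better direction.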
+ bool missing_left = true; + float gain = null_gain; + if (thread_active) { + gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, shared_inputs.param, + inputs.nidx, fidx, evaluator, missing_left); } - return *reinterpret_cast(tmp); - } -}; + __syncthreads(); + + // Find thread with best gain + cub::KeyValuePair tuple(threadIdx.x, gain); + cub::KeyValuePair best = + MaxReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax()); + + __shared__ cub::KeyValuePair block_max; + if (threadIdx.x == 0) { + block_max = best; + } + + cub::CTA_SYNC(); + // Best thread updates the split + if (threadIdx.x == block_max.key) { + int32_t split_gidx = (scan_begin + threadIdx.x); + float fvalue = shared_inputs.feature_values[split_gidx]; + GradientPairPrecise left = + missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; + GradientPairPrecise right = inputs.parent_sum - left; + auto best_thresh = block_max.key; // index of best threshold inside a feature. + best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, right, + true, shared_inputs.param); + } + cub::CTA_SYNC(); + } +} /*! \brief Find the thread with best gain. */ -template -__device__ void EvaluateFeature( +template +__device__ void EvaluateOneHotFeature( int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span sorted_idx, size_t offset, @@ -105,7 +161,7 @@ __device__ void EvaluateFeature( TempStorageT *temp_storage // temp memory for cub operations ) { // Use pointer from cut to indicate begin and end of bins for each feature. - LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); + dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin uint32_t gidx_end = ldg_feature_segments[fidx + 1]; // end bin for i^th feature @@ -113,7 +169,7 @@ __device__ void EvaluateFeature( // Sum histogram bins for current feature GradientSumT const feature_sum = - ReduceFeature(feature_hist, temp_storage); + ReduceFeature(feature_hist, temp_storage); GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; float const null_gain = -std::numeric_limits::infinity(); @@ -122,34 +178,10 @@ __device__ void EvaluateFeature( for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - auto calc_bin_value = [&]() { - GradientSumT bin; - switch (type) { - case kOneHot: { - auto rest = - thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); - bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT - break; - } - case kNum: { - bin = - thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); - ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); - break; - } - case kPart: { - auto rest = thread_active - ? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset] - : GradientSumT(); - // No min value for cat feature, use inclusive scan. - ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op); - bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT - break; - } - } - return bin; - }; - auto bin = calc_bin_value(); + auto rest = + thread_active ? 
inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); + GradientSumT bin = + GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT // Whether the gradient of missing values is put to the left side. bool missing_left = true; float gain = null_gain; @@ -174,25 +206,6 @@ __device__ void EvaluateFeature( // Best thread updates the split if (threadIdx.x == block_max.key) { - switch (type) { - case kNum: { - // Use pointer from cut to indicate begin and end of bins for each feature. - uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin - int split_gidx = (scan_begin + threadIdx.x) - 1; - float fvalue; - if (split_gidx < static_cast(gidx_begin)) { - fvalue = shared_inputs.min_fvalue[fidx]; - } else { - fvalue = shared_inputs.feature_values[split_gidx]; - } - GradientPairPrecise left = - missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; - GradientPairPrecise right = inputs.parent_sum - left; - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, - false, shared_inputs.param); - break; - } - case kOneHot: { int32_t split_gidx = (scan_begin + threadIdx.x); float fvalue = shared_inputs.feature_values[split_gidx]; GradientPairPrecise left = @@ -200,40 +213,104 @@ __device__ void EvaluateFeature( GradientPairPrecise right = inputs.parent_sum - left; best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, true, shared_inputs.param); - break; - } - case kPart: { - int32_t split_gidx = (scan_begin + threadIdx.x); - float fvalue = shared_inputs.feature_values[split_gidx]; - GradientPairPrecise left = - missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; - GradientPairPrecise right = inputs.parent_sum - left; - auto best_thresh = block_max.key; // index of best threshold inside a feature. - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, - right, true, shared_inputs.param); - break; - } - } } cub::CTA_SYNC(); } } +template +__device__ void EvaluateNumericalFeature( + int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, + TreeEvaluator::SplitEvaluator evaluator, + common::Span sorted_idx, size_t offset, + DeviceSplitCandidate *best_split, // shared memory storing best split + TempStorageT *temp_storage // temp memory for cub operations +) { + // Use pointer from cut to indicate begin and end of bins for each feature. 
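+  // dh::LDGIterator issues these loads through the read-only data cache
+  // (__ldg); this is safe because the cut segments are immutable for the
+  // lifetime of the kernel.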
+ dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); + uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin + uint32_t gidx_end = + ldg_feature_segments[fidx + 1]; // end bin for i^th feature + auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); + + // Sum histogram bins for current feature + GradientSumT const feature_sum = + ReduceFeature(feature_hist, temp_storage); + + GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; + float const null_gain = -std::numeric_limits::infinity(); + + SumCallbackOp prefix_op = SumCallbackOp(); + for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { + bool thread_active = (scan_begin + threadIdx.x) < gidx_end; + + double tmp[2]; + GradientSumT &bin = *reinterpret_cast(&tmp[0]); + BlockLoadT(temp_storage->load) + .Load(reinterpret_cast(inputs.gradient_histogram.data() + scan_begin), tmp, + min(gidx_end - scan_begin, BLOCK_THREADS) * 2, 0.0); + __syncthreads(); + ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); + // Whether the gradient of missing values is put to the left side. + bool missing_left = true; + float gain = null_gain; + if (thread_active) { + gain = LossChangeMissing(bin, missing, inputs.parent_sum, shared_inputs.param, + inputs.nidx, fidx, evaluator, missing_left); + } + + __syncthreads(); + + // Find thread with best gain + cub::KeyValuePair tuple(threadIdx.x, gain); + cub::KeyValuePair best = + MaxReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax()); + + __shared__ cub::KeyValuePair block_max; + if (threadIdx.x == 0) { + block_max = best; + } + + cub::CTA_SYNC(); + + // Best thread updates the split + if (threadIdx.x == block_max.key) { + // Use pointer from cut to indicate begin and end of bins for each feature. + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin + int split_gidx = (scan_begin + threadIdx.x) - 1; + float fvalue; + if (split_gidx < static_cast(gidx_begin)) { + fvalue = shared_inputs.min_fvalue[fidx]; + } else { + fvalue = shared_inputs.feature_values[split_gidx]; + } + GradientPairPrecise left = + missing_left ? bin + missing : bin; + GradientPairPrecise right = inputs.parent_sum - left; + best_split->Update(gain, missing_left ? 
kLeftDir : kRightDir, fvalue, fidx, left, right, + false, shared_inputs.param); + } + } + cub::CTA_SYNC(); +} + template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,LDGIterator d_inputs, - const EvaluateSplitSharedInputs shared_inputs, - common::Span sorted_idx, - TreeEvaluator::SplitEvaluator evaluator, - common::Span out_candidates) { +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel( + bst_feature_t number_active_features, dh::LDGIterator d_inputs, + const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, + TreeEvaluator::SplitEvaluator evaluator, + common::Span out_candidates) { // KeyValuePair here used as threadIdx.x -> gain_value using ArgMaxT = cub::KeyValuePair; - using BlockScanT = - cub::BlockScan; + using BlockScanT = cub::BlockScan; + using BlockLoadT = cub::BlockLoad; using MaxReduceT = cub::BlockReduce; using SumReduceT = cub::BlockReduce; union TempStorage { + typename BlockLoadT::TempStorage load; typename BlockScanT::TempStorage scan; typename MaxReduceT::TempStorage max_reduce; typename SumReduceT::TempStorage sum_reduce; @@ -255,25 +332,33 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_featur const EvaluateSplitInputs &inputs = d_inputs[input_idx]; // One block for each feature. Features are sampled, so fidx != blockIdx.x - int fidx = inputs.feature_set[blockIdx.x % number_active_features]; + int fidx = 0; + // Avoid global memory load when columns aren't sampled + if (inputs.feature_set.size() == shared_inputs.Features()) { + fidx = blockIdx.x % number_active_features; + } else { + fidx = inputs.feature_set[blockIdx.x % number_active_features]; + } + /* if (common::IsCat(shared_inputs.feature_types, fidx)) { auto n_bins_in_feat = shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx]; if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) { - EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + EvaluateOneHotFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - EvaluateFeature(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset, + EvaluatePartitionFeature(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset, &best_split, &temp_storage); } } else { - EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); - } + */ + EvaluateNumericalFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + //} cub::CTA_SYNC(); if (threadIdx.x == 0) { @@ -346,7 +431,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features,LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features,dh::LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features From 18252774f8acd3105177cfa5025dc1088e48aa3a 
Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 27 Jun 2022 05:32:08 -0700 Subject: [PATCH 12/17] Revert "Separate EvaluateFeature implementations." This reverts commit dd1138300cd18c121068b03499f0a0ef72947c7a. --- src/common/device_helpers.cuh | 21 -- src/tree/gpu_hist/evaluate_splits.cu | 293 ++++++++++----------------- 2 files changed, 104 insertions(+), 210 deletions(-) diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index da19fe2b510e..123dc14e57be 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1939,25 +1939,4 @@ class CUDAStream { CUDAStreamView View() const { return CUDAStreamView{stream_}; } void Sync() { this->View().Sync(); } }; - -// Force nvcc to load data as constant -template -class LDGIterator { - typedef typename cub::UnitWord::DeviceWord DeviceWordT; - static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); - - const T* ptr; - - public: - XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} - __device__ T operator[](std::size_t idx) const { - DeviceWordT tmp[kNumWords]; -#pragma unroll - for (int i = 0; i < kNumWords; i++) { - tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); - } - return *reinterpret_cast(tmp); - } -}; - } // namespace dh diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 38c9fbe3fd2c..ef509c3a9e60 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -47,7 +47,7 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, * \param end * \param temp_storage Shared memory for intermediate result. */ -template __device__ GradientSumT ReduceFeature(common::Span feature_histogram, @@ -60,14 +60,9 @@ ReduceFeature(common::Span feature_histogram, auto begin = feature_histogram.data(); auto end = begin + feature_histogram.size(); for (auto itr = begin; itr < end; itr += BLOCK_THREADS) { - double tmp[2]; - GradientSumT &bin = *reinterpret_cast(&tmp[0]); - BlockLoadT(temp_storage->load) - .Load(reinterpret_cast(itr), tmp, - min(int(end - itr), int(BLOCK_THREADS)) * 2, 0.0); - __syncthreads(); - //bool thread_active = itr + threadIdx.x < end; - //GradientSumT bin = thread_active ? *(itr + threadIdx.x) : GradientSumT(); + bool thread_active = itr + threadIdx.x < end; + // Scan histogram + GradientSumT bin = thread_active ? *(itr + threadIdx.x) : GradientSumT(); local_sum += bin; } local_sum = ReduceT(temp_storage->sum_reduce).Reduce(local_sum, cub::Sum()); @@ -78,82 +73,31 @@ ReduceFeature(common::Span feature_histogram, cub::CTA_SYNC(); return shared_sum; } - -/*! \brief Find the thread with best gain. */ -template -__device__ void EvaluatePartitionFeature( - int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, - TreeEvaluator::SplitEvaluator evaluator, - common::Span sorted_idx, size_t offset, - DeviceSplitCandidate *best_split, // shared memory storing best split - TempStorageT *temp_storage // temp memory for cub operations -) { - // Use pointer from cut to indicate begin and end of bins for each feature. 
- dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); - uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin - uint32_t gidx_end = - ldg_feature_segments[fidx + 1]; // end bin for i^th feature - auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); - - // Sum histogram bins for current feature - GradientSumT const feature_sum = - ReduceFeature(feature_hist, temp_storage); - - GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; - float const null_gain = -std::numeric_limits::infinity(); - - SumCallbackOp prefix_op = SumCallbackOp(); - for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { - bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - - auto rest = thread_active - ? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset] - : GradientSumT(); - // No min value for cat feature, use inclusive scan. - ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op); - GradientSumT bin = - GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT - // Whether the gradient of missing values is put to the left side. - bool missing_left = true; - float gain = null_gain; - if (thread_active) { - gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, shared_inputs.param, - inputs.nidx, fidx, evaluator, missing_left); - } - - __syncthreads(); - - // Find thread with best gain - cub::KeyValuePair tuple(threadIdx.x, gain); - cub::KeyValuePair best = - MaxReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax()); - - __shared__ cub::KeyValuePair block_max; - if (threadIdx.x == 0) { - block_max = best; +// Force nvcc to load data as constant +template +class LDGIterator { + typedef typename cub::UnitWord::DeviceWord DeviceWordT; + static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); + + const T* ptr; + + public: + XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} + __device__ T operator[](std::size_t idx) const { + DeviceWordT tmp[kNumWords]; +#pragma unroll + for (int i = 0; i < kNumWords; i++) { + tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); } + return *reinterpret_cast(tmp); + } +}; - cub::CTA_SYNC(); - // Best thread updates the split - if (threadIdx.x == block_max.key) { - int32_t split_gidx = (scan_begin + threadIdx.x); - float fvalue = shared_inputs.feature_values[split_gidx]; - GradientPairPrecise left = - missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; - GradientPairPrecise right = inputs.parent_sum - left; - auto best_thresh = block_max.key; // index of best threshold inside a feature. - best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, right, - true, shared_inputs.param); - } - cub::CTA_SYNC(); - } -} /*! \brief Find the thread with best gain. */ -template -__device__ void EvaluateOneHotFeature( +template +__device__ void EvaluateFeature( int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span sorted_idx, size_t offset, @@ -161,7 +105,7 @@ __device__ void EvaluateOneHotFeature( TempStorageT *temp_storage // temp memory for cub operations ) { // Use pointer from cut to indicate begin and end of bins for each feature. 
- dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); + LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin uint32_t gidx_end = ldg_feature_segments[fidx + 1]; // end bin for i^th feature @@ -169,7 +113,7 @@ __device__ void EvaluateOneHotFeature( // Sum histogram bins for current feature GradientSumT const feature_sum = - ReduceFeature(feature_hist, temp_storage); + ReduceFeature(feature_hist, temp_storage); GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; float const null_gain = -std::numeric_limits::infinity(); @@ -178,10 +122,34 @@ __device__ void EvaluateOneHotFeature( for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - auto rest = - thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); - GradientSumT bin = - GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT + auto calc_bin_value = [&]() { + GradientSumT bin; + switch (type) { + case kOneHot: { + auto rest = + thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); + bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT + break; + } + case kNum: { + bin = + thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT(); + ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); + break; + } + case kPart: { + auto rest = thread_active + ? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset] + : GradientSumT(); + // No min value for cat feature, use inclusive scan. + ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op); + bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing}; // NOLINT + break; + } + } + return bin; + }; + auto bin = calc_bin_value(); // Whether the gradient of missing values is put to the left side. bool missing_left = true; float gain = null_gain; @@ -206,6 +174,25 @@ __device__ void EvaluateOneHotFeature( // Best thread updates the split if (threadIdx.x == block_max.key) { + switch (type) { + case kNum: { + // Use pointer from cut to indicate begin and end of bins for each feature. + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin + int split_gidx = (scan_begin + threadIdx.x) - 1; + float fvalue; + if (split_gidx < static_cast(gidx_begin)) { + fvalue = shared_inputs.min_fvalue[fidx]; + } else { + fvalue = shared_inputs.feature_values[split_gidx]; + } + GradientPairPrecise left = + missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; + GradientPairPrecise right = inputs.parent_sum - left; + best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right, + false, shared_inputs.param); + break; + } + case kOneHot: { int32_t split_gidx = (scan_begin + threadIdx.x); float fvalue = shared_inputs.feature_values[split_gidx]; GradientPairPrecise left = @@ -213,104 +200,40 @@ __device__ void EvaluateOneHotFeature( GradientPairPrecise right = inputs.parent_sum - left; best_split->Update(gain, missing_left ? 
kLeftDir : kRightDir, fvalue, fidx, left, right, true, shared_inputs.param); - } - cub::CTA_SYNC(); - } -} - -template -__device__ void EvaluateNumericalFeature( - int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, - TreeEvaluator::SplitEvaluator evaluator, - common::Span sorted_idx, size_t offset, - DeviceSplitCandidate *best_split, // shared memory storing best split - TempStorageT *temp_storage // temp memory for cub operations -) { - // Use pointer from cut to indicate begin and end of bins for each feature. - dh::LDGIterator ldg_feature_segments(shared_inputs.feature_segments.data()); - uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin - uint32_t gidx_end = - ldg_feature_segments[fidx + 1]; // end bin for i^th feature - auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); - - // Sum histogram bins for current feature - GradientSumT const feature_sum = - ReduceFeature(feature_hist, temp_storage); - - GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum}; - float const null_gain = -std::numeric_limits::infinity(); - - SumCallbackOp prefix_op = SumCallbackOp(); - for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) { - bool thread_active = (scan_begin + threadIdx.x) < gidx_end; - - double tmp[2]; - GradientSumT &bin = *reinterpret_cast(&tmp[0]); - BlockLoadT(temp_storage->load) - .Load(reinterpret_cast(inputs.gradient_histogram.data() + scan_begin), tmp, - min(gidx_end - scan_begin, BLOCK_THREADS) * 2, 0.0); - __syncthreads(); - ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op); - // Whether the gradient of missing values is put to the left side. - bool missing_left = true; - float gain = null_gain; - if (thread_active) { - gain = LossChangeMissing(bin, missing, inputs.parent_sum, shared_inputs.param, - inputs.nidx, fidx, evaluator, missing_left); - } - - __syncthreads(); - - // Find thread with best gain - cub::KeyValuePair tuple(threadIdx.x, gain); - cub::KeyValuePair best = - MaxReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax()); - - __shared__ cub::KeyValuePair block_max; - if (threadIdx.x == 0) { - block_max = best; - } - - cub::CTA_SYNC(); - - // Best thread updates the split - if (threadIdx.x == block_max.key) { - // Use pointer from cut to indicate begin and end of bins for each feature. - uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin - int split_gidx = (scan_begin + threadIdx.x) - 1; - float fvalue; - if (split_gidx < static_cast(gidx_begin)) { - fvalue = shared_inputs.min_fvalue[fidx]; - } else { - fvalue = shared_inputs.feature_values[split_gidx]; + break; + } + case kPart: { + int32_t split_gidx = (scan_begin + threadIdx.x); + float fvalue = shared_inputs.feature_values[split_gidx]; + GradientPairPrecise left = + missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin}; + GradientPairPrecise right = inputs.parent_sum - left; + auto best_thresh = block_max.key; // index of best threshold inside a feature. + best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left, + right, true, shared_inputs.param); + break; + } } - GradientPairPrecise left = - missing_left ? bin + missing : bin; - GradientPairPrecise right = inputs.parent_sum - left; - best_split->Update(gain, missing_left ? 
kLeftDir : kRightDir, fvalue, fidx, left, right, - false, shared_inputs.param); } + cub::CTA_SYNC(); } - cub::CTA_SYNC(); } template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel( - bst_feature_t number_active_features, dh::LDGIterator d_inputs, - const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, - TreeEvaluator::SplitEvaluator evaluator, - common::Span out_candidates) { +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,LDGIterator d_inputs, + const EvaluateSplitSharedInputs shared_inputs, + common::Span sorted_idx, + TreeEvaluator::SplitEvaluator evaluator, + common::Span out_candidates) { // KeyValuePair here used as threadIdx.x -> gain_value using ArgMaxT = cub::KeyValuePair; - using BlockScanT = cub::BlockScan; - using BlockLoadT = cub::BlockLoad; + using BlockScanT = + cub::BlockScan; using MaxReduceT = cub::BlockReduce; using SumReduceT = cub::BlockReduce; union TempStorage { - typename BlockLoadT::TempStorage load; typename BlockScanT::TempStorage scan; typename MaxReduceT::TempStorage max_reduce; typename SumReduceT::TempStorage sum_reduce; @@ -332,33 +255,25 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel( const EvaluateSplitInputs &inputs = d_inputs[input_idx]; // One block for each feature. Features are sampled, so fidx != blockIdx.x - int fidx = 0; - // Avoid global memory load when columns aren't sampled - if (inputs.feature_set.size() == shared_inputs.Features()) { - fidx = blockIdx.x % number_active_features; - } else { - fidx = inputs.feature_set[blockIdx.x % number_active_features]; - } + int fidx = inputs.feature_set[blockIdx.x % number_active_features]; - /* if (common::IsCat(shared_inputs.feature_types, fidx)) { auto n_bins_in_feat = shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx]; if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) { - EvaluateOneHotFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); } else { auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; auto node_sorted_idx = sorted_idx.subspan(offset, total_bins); - EvaluatePartitionFeature(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset, + EvaluateFeature(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset, &best_split, &temp_storage); } } else { - */ - EvaluateNumericalFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); - //} + EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + } cub::CTA_SYNC(); if (threadIdx.x == 0) { @@ -431,7 +346,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features,dh::LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features,LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features From 7a249ab9cdaa224a748c0d9011cb2974ae27c997 
Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 27 Jun 2022 05:32:15 -0700 Subject: [PATCH 13/17] Revert "Use ldg" This reverts commit d5e8eac58f7fde53a343bc113390568d32c75235. --- src/tree/gpu_hist/evaluate_splits.cu | 29 ++++------------------------ 1 file changed, 4 insertions(+), 25 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index ef509c3a9e60..90687376c1ff 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -73,26 +73,6 @@ ReduceFeature(common::Span feature_histogram, cub::CTA_SYNC(); return shared_sum; } -// Force nvcc to load data as constant -template -class LDGIterator { - typedef typename cub::UnitWord::DeviceWord DeviceWordT; - static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT); - - const T* ptr; - - public: - XGBOOST_DEVICE LDGIterator(const T* ptr) : ptr(ptr) {} - __device__ T operator[](std::size_t idx) const { - DeviceWordT tmp[kNumWords]; -#pragma unroll - for (int i = 0; i < kNumWords; i++) { - tmp[i] = __ldg(reinterpret_cast(ptr + idx) + i); - } - return *reinterpret_cast(tmp); - } -}; - /*! \brief Find the thread with best gain. */ template ldg_feature_segments(shared_inputs.feature_segments.data()); - uint32_t gidx_begin = ldg_feature_segments[fidx]; // beginning bin + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin uint32_t gidx_end = - ldg_feature_segments[fidx + 1]; // end bin for i^th feature + shared_inputs.feature_segments[fidx + 1]; // end bin for i^th feature auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); // Sum histogram bins for current feature @@ -220,7 +199,7 @@ __device__ void EvaluateFeature( } template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,LDGIterator d_inputs, +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, @@ -346,7 +325,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a // One block for each feature uint32_t constexpr kBlockThreads = 256; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features,LDGIterator(d_inputs.data()), shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features,d_inputs, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features From b6a55a794d5ed8847bdaad07bac7756162fef85a Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Fri, 1 Jul 2022 04:12:54 -0700 Subject: [PATCH 14/17] Reduce block size. 
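Each block evaluates a single (node, feature) pair and walks that feature's
bins in strides of BLOCK_THREADS, so with the default 256 histogram bins
there are only a handful of scan/reduce rounds per block; dropping from 256
threads to one warp should let more blocks stay resident without changing
the per-block work shape. A rough host-side sketch of the launch
arithmetic (the node and feature counts below are hypothetical):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  std::size_t n_nodes = 2;                     // e.g. a left and a right child
  std::uint32_t number_active_features = 100;  // after column sampling
  constexpr std::uint32_t kBlockThreads = 32;  // one warp per block
  // One block per (node, feature) pair, as in LaunchEvaluateSplits.
  std::size_t grid_size = n_nodes * number_active_features;
  // ~256 bins per feature => ceil(256 / 32) = 8 strided rounds per block.
  std::printf("launch: %zu blocks x %u threads\n", grid_size, kBlockThreads);
  return 0;
}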
--- src/tree/gpu_hist/evaluate_splits.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 90687376c1ff..3983e0c332fb 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -199,7 +199,7 @@ __device__ void EvaluateFeature( } template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features,common::Span d_inputs, +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features, common::Span d_inputs, const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, TreeEvaluator::SplitEvaluator evaluator, @@ -323,7 +323,7 @@ void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_a dh::TemporaryArray feature_best_splits(combined_num_features); // One block for each feature - uint32_t constexpr kBlockThreads = 256; + uint32_t constexpr kBlockThreads = 32; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( EvaluateSplitsKernel, number_active_features,d_inputs, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); From a47b5c1f92f728476d04eb0d9395ca811367faaf Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 4 Jul 2022 03:02:45 -0700 Subject: [PATCH 15/17] Lint. --- src/tree/gpu_hist/evaluate_splits.cu | 134 ++++++++++++++------------- src/tree/gpu_hist/evaluator.cu | 6 +- 2 files changed, 73 insertions(+), 67 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu index 3983e0c332fb..249a2aca7834 100644 --- a/src/tree/gpu_hist/evaluate_splits.cu +++ b/src/tree/gpu_hist/evaluate_splits.cu @@ -2,6 +2,7 @@ * Copyright 2020-2022 by XGBoost Contributors */ #include // std::max +#include #include #include "../../common/categorical.h" @@ -22,11 +23,10 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, TreeEvaluator::SplitEvaluator evaluator, bool &missing_left_out) { // NOLINT float parent_gain = CalcGain(param, parent_sum); - float missing_left_gain = - evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing), - GradStats(parent_sum - (scan + missing))); - float missing_right_gain = evaluator.CalcSplitGain( - param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan)); + float missing_left_gain = evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing), + GradStats(parent_sum - (scan + missing))); + float missing_right_gain = + evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan)); if (missing_left_gain > missing_right_gain) { missing_left_out = true; @@ -47,13 +47,11 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan, * \param end * \param temp_storage Shared memory for intermediate result. 
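 *
 * Reduction shape, sketched rather than verbatim (ReduceT stands in for
 * the block-reduce template parameter): each thread first accumulates a
 * strided partial sum over the feature's bins, then one block-wide cub
 * reduction collapses the BLOCK_THREADS partials:
 *
 *   GradientSumT local_sum{};  // per-thread partial
 *   for (auto i = threadIdx.x; i < feature_histogram.size(); i += BLOCK_THREADS)
 *     local_sum += feature_histogram[i];
 *   shared_sum = ReduceT(temp_storage->sum_reduce).Reduce(local_sum, cub::Sum());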
*/ -template -__device__ GradientSumT -ReduceFeature(common::Span feature_histogram, - TempStorageT* temp_storage) { +template +__device__ GradientSumT ReduceFeature(common::Span feature_histogram, + TempStorageT *temp_storage) { __shared__ cub::Uninitialized uninitialized_sum; - GradientSumT& shared_sum = uninitialized_sum.Alias(); + GradientSumT &shared_sum = uninitialized_sum.Alias(); GradientSumT local_sum = GradientSumT(); // For loop sums features into one block size @@ -78,16 +76,15 @@ ReduceFeature(common::Span feature_histogram, template __device__ void EvaluateFeature( - int fidx, const EvaluateSplitInputs &inputs,const EvaluateSplitSharedInputs &shared_inputs, + int fidx, const EvaluateSplitInputs &inputs, const EvaluateSplitSharedInputs &shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span sorted_idx, size_t offset, DeviceSplitCandidate *best_split, // shared memory storing best split TempStorageT *temp_storage // temp memory for cub operations ) { // Use pointer from cut to indicate begin and end of bins for each feature. - uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin - uint32_t gidx_end = - shared_inputs.feature_segments[fidx + 1]; // end bin for i^th feature + uint32_t gidx_begin = shared_inputs.feature_segments[fidx]; // beginning bin + uint32_t gidx_end = shared_inputs.feature_segments[fidx + 1]; // end bin for i^th feature auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin); // Sum histogram bins for current feature @@ -133,8 +130,8 @@ __device__ void EvaluateFeature( bool missing_left = true; float gain = null_gain; if (thread_active) { - gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, shared_inputs.param, - inputs.nidx, fidx, evaluator, missing_left); + gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum, + shared_inputs.param, inputs.nidx, fidx, evaluator, missing_left); } __syncthreads(); @@ -199,15 +196,14 @@ __device__ void EvaluateFeature( } template -__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_feature_t number_active_features, common::Span d_inputs, - const EvaluateSplitSharedInputs shared_inputs, - common::Span sorted_idx, - TreeEvaluator::SplitEvaluator evaluator, - common::Span out_candidates) { +__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel( + bst_feature_t number_active_features, common::Span d_inputs, + const EvaluateSplitSharedInputs shared_inputs, common::Span sorted_idx, + TreeEvaluator::SplitEvaluator evaluator, + common::Span out_candidates) { // KeyValuePair here used as threadIdx.x -> gain_value using ArgMaxT = cub::KeyValuePair; - using BlockScanT = - cub::BlockScan; + using BlockScanT = cub::BlockScan; using MaxReduceT = cub::BlockReduce; using SumReduceT = cub::BlockReduce; @@ -220,7 +216,7 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_featur // Aligned && shared storage for best_split __shared__ cub::Uninitialized uninitialized_split; - DeviceSplitCandidate& best_split = uninitialized_split.Alias(); + DeviceSplitCandidate &best_split = uninitialized_split.Alias(); __shared__ TempStorage temp_storage; if (threadIdx.x == 0) { @@ -237,10 +233,12 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_featur int fidx = inputs.feature_set[blockIdx.x % number_active_features]; if (common::IsCat(shared_inputs.feature_types, fidx)) { - auto n_bins_in_feat = shared_inputs.feature_segments[fidx + 1] - 
shared_inputs.feature_segments[fidx]; + auto n_bins_in_feat = + shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx]; if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) { EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + kOneHot>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split, + &temp_storage); } else { auto total_bins = shared_inputs.feature_values.size(); size_t offset = total_bins * input_idx; @@ -251,7 +249,8 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_featur } } else { EvaluateFeature(fidx, inputs,shared_inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage); + kNum>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split, + &temp_storage); } cub::CTA_SYNC(); @@ -261,8 +260,8 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(bst_featur } } -__device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, - const DeviceSplitCandidate& b) { +__device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate &a, + const DeviceSplitCandidate &b) { return b.loss_chg > a.loss_chg ? b : a; } @@ -270,9 +269,10 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a, * \brief Set the bits for categorical splits based on the split threshold. */ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inputs, - common::Span d_sorted_idx, bst_feature_t fidx, - std::size_t input_idx, common::Span out, - DeviceSplitCandidate *p_out_split) { + common::Span d_sorted_idx, + bst_feature_t fidx, std::size_t input_idx, + common::Span out, + DeviceSplitCandidate *p_out_split) { auto &out_split = *p_out_split; out_split.split_cats = common::CatBitField{out}; @@ -282,12 +282,12 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu return; } - auto node_sorted_idx = - d_sorted_idx.subspan(shared_inputs.feature_values.size()*input_idx, shared_inputs.feature_values.size()); - size_t node_offset = input_idx* shared_inputs.feature_values.size(); + auto node_sorted_idx = d_sorted_idx.subspan(shared_inputs.feature_values.size() * input_idx, + shared_inputs.feature_values.size()); + size_t node_offset = input_idx * shared_inputs.feature_values.size(); auto best_thresh = out_split.PopBestThresh(); - auto f_sorted_idx = - node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], shared_inputs.FeatureBins(fidx)); + auto f_sorted_idx = node_sorted_idx.subspan(shared_inputs.feature_segments[fidx], + shared_inputs.FeatureBins(fidx)); if (out_split.dir != kLeftDir) { // forward, missing on right auto beg = dh::tcbegin(f_sorted_idx); @@ -311,28 +311,29 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu } template -void GPUHistEvaluator::LaunchEvaluateSplits(bst_feature_t number_active_features, - common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, +void GPUHistEvaluator::LaunchEvaluateSplits( + bst_feature_t number_active_features, common::Span d_inputs, + EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator, common::Span out_splits) { if (need_sort_histogram_) { this->SortHistogram(d_inputs, shared_inputs, evaluator); } - size_t combined_num_features = number_active_features*d_inputs.size(); + size_t combined_num_features = number_active_features * d_inputs.size(); dh::TemporaryArray feature_best_splits(combined_num_features); // One block for each feature uint32_t constexpr 
kBlockThreads = 32; dh::LaunchKernel {static_cast(combined_num_features), kBlockThreads, 0}( - EvaluateSplitsKernel, number_active_features,d_inputs, shared_inputs, this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()), + EvaluateSplitsKernel, number_active_features, d_inputs, + shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()), evaluator, dh::ToSpan(feature_best_splits)); // Reduce to get best candidate for left and right child over all features - auto reduce_offset = dh::MakeTransformIterator(thrust::make_counting_iterator(0llu), - [=] __device__(size_t idx) -> size_t { - return idx*number_active_features; - }); + auto reduce_offset = dh::MakeTransformIterator( + thrust::make_counting_iterator(0llu), + [=] __device__(size_t idx) -> size_t { return idx * number_active_features; }); size_t temp_storage_bytes = 0; auto num_segments = out_splits.size(); cub::DeviceSegmentedReduce::Sum(nullptr, temp_storage_bytes, feature_best_splits.data(), @@ -360,35 +361,40 @@ void GPUHistEvaluator::CopyToHost(const std::vector &n } template -void GPUHistEvaluator::EvaluateSplits(const std::vector &nidx, bst_feature_t number_active_features,common::Span d_inputs, - EvaluateSplitSharedInputs shared_inputs, - common::Span out_entries) { +void GPUHistEvaluator::EvaluateSplits( + const std::vector &nidx, bst_feature_t number_active_features, + common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, + common::Span out_entries) { auto evaluator = this->tree_evaluator_.template GetEvaluator(); dh::TemporaryArray splits_out_storage(d_inputs.size()); auto out_splits = dh::ToSpan(splits_out_storage); - this->LaunchEvaluateSplits(number_active_features,d_inputs,shared_inputs,evaluator, out_splits); + this->LaunchEvaluateSplits(number_active_features, d_inputs, shared_inputs, evaluator, + out_splits); - auto d_sorted_idx = this->SortedIdx(d_inputs.size(),shared_inputs.feature_values.size()); + auto d_sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); auto d_entries = out_entries; auto device_cats_accessor = this->DeviceCatStorage(nidx); // turn candidate into entry, along with handling sort based split. 
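// The per-candidate lambda below also recomputes the implied leaf
// weights. Setting aside clipping (e.g. max_delta_step), the weight
// CalcWeight produces under L2 regularization is approximately
//   w = -sum_grad / (sum_hess + lambda),
// evaluated here for split.left_sum, split.right_sum and their total.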
- dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable{ + dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable { auto const input = d_inputs[i]; auto &split = out_splits[i]; auto fidx = out_splits[i].findex; if (split.is_cat) { - SetCategoricalSplit( shared_inputs,d_sorted_idx, fidx, i, device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]); + SetCategoricalSplit(shared_inputs, d_sorted_idx, fidx, i, + device_cats_accessor.GetNodeCatStorage(input.nidx), &out_splits[i]); } - float base_weight = - evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum + split.right_sum}); - float left_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum}); - float right_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.right_sum}); + float base_weight = evaluator.CalcWeight(input.nidx, shared_inputs.param, + GradStats{split.left_sum + split.right_sum}); + float left_weight = + evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.left_sum}); + float right_weight = + evaluator.CalcWeight(input.nidx, shared_inputs.param, GradStats{split.right_sum}); - d_entries[i] = GPUExpandEntry{input.nidx, input.depth, out_splits[i], - base_weight, left_weight, right_weight}; + d_entries[i] = GPUExpandEntry{input.nidx, input.depth, out_splits[i], + base_weight, left_weight, right_weight}; }); this->CopyToHost(nidx); @@ -396,15 +402,15 @@ void GPUHistEvaluator::EvaluateSplits(const std::vector GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit( - EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs,float weight) { + EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs, float weight) { dh::device_vector inputs = std::vector{input}; dh::TemporaryArray out_entries(1); - this->EvaluateSplits({input.nidx},input.feature_set.size(),dh::ToSpan(inputs),shared_inputs,dh::ToSpan(out_entries)); + this->EvaluateSplits({input.nidx}, input.feature_set.size(), dh::ToSpan(inputs), shared_inputs, + dh::ToSpan(out_entries)); GPUExpandEntry root_entry; - dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), - sizeof(GPUExpandEntry), cudaMemcpyDeviceToHost)); + dh::safe_cuda(cudaMemcpyAsync(&root_entry, out_entries.data().get(), sizeof(GPUExpandEntry), + cudaMemcpyDeviceToHost)); return root_entry; - } template class GPUHistEvaluator; diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu index 703599d2b668..93d7215a2629 100644 --- a/src/tree/gpu_hist/evaluator.cu +++ b/src/tree/gpu_hist/evaluator.cu @@ -69,8 +69,8 @@ void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts, } template -common::Span GPUHistEvaluator::SortHistogram(common::Span d_inputs, - EvaluateSplitSharedInputs shared_inputs, +common::Span GPUHistEvaluator::SortHistogram( + common::Span d_inputs, EvaluateSplitSharedInputs shared_inputs, TreeEvaluator::SplitEvaluator evaluator) { dh::XGBCachingDeviceAllocator alloc; auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()); @@ -78,7 +78,7 @@ common::Span GPUHistEvaluator::SortHistogram( auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size()); auto it = thrust::make_counting_iterator(0u); auto d_feature_idx = dh::ToSpan(feature_idx_); - auto total_bins = shared_inputs.feature_values.size(); + auto total_bins = shared_inputs.feature_values.size(); thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data), [=] XGBOOST_DEVICE(uint32_t i) { auto 
const &input = d_inputs[i / total_bins]; From 57340099eec10e8e20e53be65f3c5d11b79fe9b8 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 4 Jul 2022 05:07:15 -0700 Subject: [PATCH 16/17] Clang-tidy. --- src/tree/gpu_hist/evaluate_splits.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh index f87064ccf079..aebaa89b2e7b 100644 --- a/src/tree/gpu_hist/evaluate_splits.cuh +++ b/src/tree/gpu_hist/evaluate_splits.cuh @@ -43,11 +43,11 @@ struct EvaluateSplitSharedInputs { // Used to return internal storage regions for categoricals // Usable on device struct CatAccessor { - common::Span cat_storage_; - std::size_t node_categorical_storage_size_; + common::Span cat_storage; + std::size_t node_categorical_storage_size; XGBOOST_DEVICE common::Span GetNodeCatStorage(bst_node_t nidx) { - return this->cat_storage_.subspan(nidx * this->node_categorical_storage_size_, - this->node_categorical_storage_size_); + return this->cat_storage.subspan(nidx * this->node_categorical_storage_size, + this->node_categorical_storage_size); } }; From 65e39100cd10925843ce5dfd584b721429c224c9 Mon Sep 17 00:00:00 2001 From: Rory Mitchell Date: Mon, 4 Jul 2022 06:13:30 -0700 Subject: [PATCH 17/17] Fix dask test. --- tests/python-gpu/test_gpu_with_dask.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index 6edcbd2f5b47..8a947312eacc 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -285,13 +285,15 @@ def test_early_stopping(self, local_cuda_cluster: LocalCUDACluster) -> None: 'booster'] assert hasattr(booster, 'best_score') dump = booster.get_dump(dump_format='json') + print(booster.best_iteration) assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 valid_X = X valid_y = y cls = dxgb.DaskXGBClassifier(objective='binary:logistic', - tree_method='gpu_hist', - n_estimators=100) + tree_method='gpu_hist', + eval_metric='error', + n_estimators=100) cls.client = client cls.fit(X, y, early_stopping_rounds=early_stopping_rounds, eval_set=[(valid_X, valid_y)])