diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
index ccec859a286c..f3d387983b61 100644
--- a/src/common/device_helpers.cuh
+++ b/src/common/device_helpers.cuh
@@ -1949,7 +1949,7 @@ class LDGIterator {
   const T *ptr_;
 
  public:
-  explicit LDGIterator(const T *ptr) : ptr_(ptr) {}
+  XGBOOST_DEVICE explicit LDGIterator(const T *ptr) : ptr_(ptr) {}
   __device__ T operator[](std::size_t idx) const {
     DeviceWordT tmp[kNumWords];
     static_assert(sizeof(tmp) == sizeof(T), "Expect sizes to be equal.");
diff --git a/src/tree/gpu_hist/evaluate_splits.cu b/src/tree/gpu_hist/evaluate_splits.cu
index c79fab0c2081..908e7b01972c 100644
--- a/src/tree/gpu_hist/evaluate_splits.cu
+++ b/src/tree/gpu_hist/evaluate_splits.cu
@@ -22,202 +22,199 @@ XGBOOST_DEVICE float LossChangeMissing(const GradientPairPrecise &scan,
                                        bst_feature_t fidx,
                                        TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
                                        bool &missing_left_out) {  // NOLINT
-  float parent_gain = CalcGain(param, parent_sum);
-  float missing_left_gain = evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
-                                                    GradStats(parent_sum - (scan + missing)));
-  float missing_right_gain =
-      evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));
-
-  if (missing_left_gain > missing_right_gain) {
-    missing_left_out = true;
-    return missing_left_gain - parent_gain;
-  } else {
-    missing_left_out = false;
-    return missing_right_gain - parent_gain;
-  }
+  const auto left_sum = scan + missing;
+  float missing_left_gain =
+      evaluator.CalcSplitGain(param, nidx, fidx, left_sum, parent_sum - left_sum);
+  float missing_right_gain = evaluator.CalcSplitGain(param, nidx, fidx, scan, parent_sum - scan);
+
+  missing_left_out = missing_left_gain > missing_right_gain;
+  return missing_left_out ? missing_left_gain : missing_right_gain;
 }
 
-/*!
- * \brief
- *
- * \tparam ReduceT BlockReduce Type.
- * \tparam TempStorage Cub Shared memory
- *
- * \param begin
- * \param end
- * \param temp_storage Shared memory for intermediate result.
- */
-template <int BLOCK_THREADS, typename ReduceT, typename TempStorageT, typename GradientSumT>
-__device__ GradientSumT ReduceFeature(common::Span<const GradientSumT> feature_histogram,
-                                      TempStorageT *temp_storage) {
-  __shared__ cub::Uninitialized<GradientSumT> uninitialized_sum;
-  GradientSumT &shared_sum = uninitialized_sum.Alias();
-
-  GradientSumT local_sum = GradientSumT();
-  // For loop sums features into one block size
-  auto begin = feature_histogram.data();
-  auto end = begin + feature_histogram.size();
-  for (auto itr = begin; itr < end; itr += BLOCK_THREADS) {
-    bool thread_active = itr + threadIdx.x < end;
-    // Scan histogram
-    GradientSumT bin = thread_active ? *(itr + threadIdx.x) : GradientSumT();
-    local_sum += bin;
+// This kernel uses block_size == warp_size. This is an unusually small block size for a CUDA
+// kernel - normally a larger block size is preferred, to increase the number of resident warps
+// on each SM (occupancy). Here, however, each thread has a very large amount of work relative to
+// a typical CUDA kernel, so the SM can be highly utilised by a small number of threads.
+// Experiments showed that a small block size is significantly faster here. Furthermore, because
+// only a single warp is used, synchronisation barriers are eliminated and broadcasts can be
+// performed using warp intrinsics instead of slower shared memory.
+template <int kBlockSize>
+class EvaluateSplitAgent {
+ public:
+  using ArgMaxT = cub::KeyValuePair<int, float>;
+  using BlockScanT = cub::BlockScan<GradientPairPrecise, kBlockSize, cub::BLOCK_SCAN_WARP_SCANS>;
+  using MaxReduceT = cub::WarpReduce<ArgMaxT>;
+  using SumReduceT = cub::WarpReduce<GradientPairPrecise>;
+  struct TempStorage {
+    typename BlockScanT::TempStorage scan;
+    typename MaxReduceT::TempStorage max_reduce;
+    typename SumReduceT::TempStorage sum_reduce;
+  };
+
+  const int fidx;
+  const int nidx;
+  const float min_fvalue;
+  const uint32_t gidx_begin;  // beginning bin
+  const uint32_t gidx_end;    // end bin for i^th feature
+  const dh::LDGIterator<float> feature_values;
+  const GradientPairPrecise *node_histogram;
+  const GradientPairPrecise parent_sum;
+  const GradientPairPrecise missing;
+  const GPUTrainingParam &param;
+  const TreeEvaluator::SplitEvaluator<GPUTrainingParam> &evaluator;
+  TempStorage *temp_storage;
+  SumCallbackOp<GradientPairPrecise> prefix_op;
+  static float constexpr kNullGain = -std::numeric_limits<float>::infinity();
+
+  __device__ EvaluateSplitAgent(TempStorage *temp_storage, int fidx,
+                                const EvaluateSplitInputs &inputs,
+                                const EvaluateSplitSharedInputs &shared_inputs,
+                                const TreeEvaluator::SplitEvaluator<GPUTrainingParam> &evaluator)
+      : temp_storage(temp_storage),
+        nidx(inputs.nidx),
+        fidx(fidx),
+        min_fvalue(__ldg(shared_inputs.min_fvalue.data() + fidx)),
+        gidx_begin(__ldg(shared_inputs.feature_segments.data() + fidx)),
+        gidx_end(__ldg(shared_inputs.feature_segments.data() + fidx + 1)),
+        feature_values(shared_inputs.feature_values.data()),
+        node_histogram(inputs.gradient_histogram.data()),
+        parent_sum(dh::LDGIterator<GradientPairPrecise>(&inputs.parent_sum)[0]),
+        param(shared_inputs.param),
+        evaluator(evaluator),
+        missing(parent_sum - ReduceFeature()) {
+    static_assert(kBlockSize == 32,
+                  "This kernel relies on the assumption block_size == warp_size");
   }
-  local_sum = ReduceT(temp_storage->sum_reduce).Reduce(local_sum, cub::Sum());
-  // Reduction result is stored in thread 0.
-  if (threadIdx.x == 0) {
-    shared_sum = local_sum;
+
+  __device__ GradientPairPrecise ReduceFeature() {
+    GradientPairPrecise local_sum;
+    for (int idx = gidx_begin + threadIdx.x; idx < gidx_end; idx += kBlockSize) {
+      local_sum += LoadGpair(node_histogram + idx);
+    }
+    local_sum = SumReduceT(temp_storage->sum_reduce).Sum(local_sum);
+    // Broadcast result from thread 0
+    return {__shfl_sync(0xffffffff, local_sum.GetGrad(), 0),
+            __shfl_sync(0xffffffff, local_sum.GetHess(), 0)};
   }
-  cub::CTA_SYNC();
-  return shared_sum;
-}
 
-/*! \brief Find the thread with best gain. */
-template <int BLOCK_THREADS, typename ReduceT, typename ScanT, typename MaxReduceT,
-          typename TempStorageT, typename GradientSumT, SplitType type>
-__device__ void EvaluateFeature(
-    int fidx, const EvaluateSplitInputs &inputs, const EvaluateSplitSharedInputs &shared_inputs,
-    TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
-    common::Span<const bst_feature_t> sorted_idx, size_t offset,
-    DeviceSplitCandidate *best_split,  // shared memory storing best split
-    TempStorageT *temp_storage         // temp memory for cub operations
-) {
-  // Use pointer from cut to indicate begin and end of bins for each feature.
-  uint32_t gidx_begin = shared_inputs.feature_segments[fidx];     // beginning bin
-  uint32_t gidx_end = shared_inputs.feature_segments[fidx + 1];   // end bin for i^th feature
-  auto feature_hist = inputs.gradient_histogram.subspan(gidx_begin, gidx_end - gidx_begin);
-
-  // Sum histogram bins for current feature
-  GradientSumT const feature_sum =
-      ReduceFeature<BLOCK_THREADS, ReduceT, TempStorageT, GradientSumT>(feature_hist, temp_storage);
-
-  GradientPairPrecise const missing = inputs.parent_sum - GradientPairPrecise{feature_sum};
-  float const null_gain = -std::numeric_limits<float>::infinity();
-
-  SumCallbackOp<GradientSumT> prefix_op = SumCallbackOp<GradientSumT>();
-  for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += BLOCK_THREADS) {
-    bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
-
-    auto calc_bin_value = [&]() {
-      GradientSumT bin;
-      switch (type) {
-        case kOneHot: {
-          auto rest =
-              thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT();
-          bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing};  // NOLINT
-          break;
-        }
-        case kNum: {
-          bin =
-              thread_active ? inputs.gradient_histogram[scan_begin + threadIdx.x] : GradientSumT();
-          ScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
-          break;
-        }
-        case kPart: {
-          auto rest = thread_active
-                          ? inputs.gradient_histogram[sorted_idx[scan_begin + threadIdx.x] - offset]
-                          : GradientSumT();
-          // No min value for cat feature, use inclusive scan.
-          ScanT(temp_storage->scan).InclusiveScan(rest, rest, cub::Sum(), prefix_op);
-          bin = GradientSumT{inputs.parent_sum - GradientPairPrecise{rest} - missing};  // NOLINT
-          break;
-        }
+  // Load using an efficient 128-bit vector load instruction
+  __device__ __forceinline__ GradientPairPrecise LoadGpair(const GradientPairPrecise *ptr) {
+    static_assert(sizeof(GradientPairPrecise) == sizeof(float4),
+                  "Vector type size does not match gradient pair size.");
+    float4 tmp = *reinterpret_cast<const float4 *>(ptr);
+    return *reinterpret_cast<const GradientPairPrecise *>(&tmp);
+  }
+
+  __device__ __forceinline__ void Numerical(DeviceSplitCandidate *__restrict__ best_split) {
+    for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
+      bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
+      GradientPairPrecise bin = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
+                                              : GradientPairPrecise();
+      BlockScanT(temp_storage->scan).ExclusiveScan(bin, bin, cub::Sum(), prefix_op);
+      // Whether the gradient of missing values is put to the left side.
+      bool missing_left = true;
+      float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
+                                                     evaluator, missing_left)
+                                 : kNullGain;
+
+      // Find thread with best gain
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      // This reduce result is only valid in thread 0;
+      // broadcast to the rest of the warp
+      auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+
+      // Best thread updates the split
+      if (threadIdx.x == best_thread) {
+        // Use pointer from cut to indicate begin and end of bins for each feature.
+        int split_gidx = (scan_begin + threadIdx.x) - 1;
+        float fvalue =
+            split_gidx < static_cast<int>(gidx_begin) ? min_fvalue : feature_values[split_gidx];
+        GradientPairPrecise left = missing_left ? bin + missing : bin;
+        GradientPairPrecise right = parent_sum - left;
+        best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
+                           false, param);
+      }
-    return bin;
-  };
-  auto bin = calc_bin_value();
-  // Whether the gradient of missing values is put to the left side.
-  bool missing_left = true;
-  float gain = null_gain;
-  if (thread_active) {
-    gain = LossChangeMissing(GradientPairPrecise{bin}, missing, inputs.parent_sum,
-                             shared_inputs.param, inputs.nidx, fidx, evaluator, missing_left);
     }
+  }
 
-    __syncthreads();
-
-    // Find thread with best gain
-    cub::KeyValuePair<int, float> tuple(threadIdx.x, gain);
-    cub::KeyValuePair<int, float> best =
-        MaxReduceT(temp_storage->max_reduce).Reduce(tuple, cub::ArgMax());
-
-    __shared__ cub::KeyValuePair<int, float> block_max;
-    if (threadIdx.x == 0) {
-      block_max = best;
+  __device__ __forceinline__ void OneHot(DeviceSplitCandidate *__restrict__ best_split) {
+    for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
+      bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
+
+      auto rest = thread_active ? LoadGpair(node_histogram + scan_begin + threadIdx.x)
+                                : GradientPairPrecise();
+      GradientPairPrecise bin = parent_sum - rest - missing;
+      // Whether the gradient of missing values is put to the left side.
+      bool missing_left = true;
+      float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
+                                                     evaluator, missing_left)
+                                 : kNullGain;
+
+      // Find thread with best gain
+      auto best = MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      // This reduce result is only valid in thread 0;
+      // broadcast to the rest of the warp
+      auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+      // Best thread updates the split
+      if (threadIdx.x == best_thread) {
+        int32_t split_gidx = (scan_begin + threadIdx.x);
+        float fvalue = feature_values[split_gidx];
+        GradientPairPrecise left = missing_left ? bin + missing : bin;
+        GradientPairPrecise right = parent_sum - left;
+        best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
+                           true, param);
+      }
     }
-
-    cub::CTA_SYNC();
-
-    // Best thread updates the split
-    if (threadIdx.x == block_max.key) {
-      switch (type) {
-        case kNum: {
-          // Use pointer from cut to indicate begin and end of bins for each feature.
-          uint32_t gidx_begin = shared_inputs.feature_segments[fidx];  // beginning bin
-          int split_gidx = (scan_begin + threadIdx.x) - 1;
-          float fvalue;
-          if (split_gidx < static_cast<int>(gidx_begin)) {
-            fvalue = shared_inputs.min_fvalue[fidx];
-          } else {
-            fvalue = shared_inputs.feature_values[split_gidx];
-          }
-          GradientPairPrecise left =
-              missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
-          GradientPairPrecise right = inputs.parent_sum - left;
-          best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
-                             false, shared_inputs.param);
-          break;
-        }
-        case kOneHot: {
-          int32_t split_gidx = (scan_begin + threadIdx.x);
-          float fvalue = shared_inputs.feature_values[split_gidx];
-          GradientPairPrecise left =
-              missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
-          GradientPairPrecise right = inputs.parent_sum - left;
-          best_split->Update(gain, missing_left ? kLeftDir : kRightDir, fvalue, fidx, left, right,
-                             true, shared_inputs.param);
-          break;
-        }
-        case kPart: {
-          int32_t split_gidx = (scan_begin + threadIdx.x);
-          float fvalue = shared_inputs.feature_values[split_gidx];
-          GradientPairPrecise left =
-              missing_left ? GradientPairPrecise{bin} + missing : GradientPairPrecise{bin};
-          GradientPairPrecise right = inputs.parent_sum - left;
-          auto best_thresh = block_max.key;  // index of best threshold inside a feature.
-          best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left,
-                             right, true, shared_inputs.param);
-          break;
-        }
+  }
+
+  __device__ __forceinline__ void Partition(DeviceSplitCandidate *__restrict__ best_split,
+                                            bst_feature_t *__restrict__ sorted_idx,
+                                            std::size_t offset) {
+    for (int scan_begin = gidx_begin; scan_begin < gidx_end; scan_begin += kBlockSize) {
+      bool thread_active = (scan_begin + threadIdx.x) < gidx_end;
+
+      auto rest = thread_active
+                      ? LoadGpair(node_histogram + sorted_idx[scan_begin + threadIdx.x] - offset)
+                      : GradientPairPrecise();
+      // No min value for cat feature, use inclusive scan.
+      BlockScanT(temp_storage->scan).InclusiveSum(rest, rest, prefix_op);
+      GradientPairPrecise bin = parent_sum - rest - missing;
+
+      // Whether the gradient of missing values is put to the left side.
+      bool missing_left = true;
+      float gain = thread_active ? LossChangeMissing(bin, missing, parent_sum, param, nidx, fidx,
+                                                     evaluator, missing_left)
+                                 : kNullGain;
+
+      // Find thread with best gain
+      auto best =
+          MaxReduceT(temp_storage->max_reduce).Reduce({threadIdx.x, gain}, cub::ArgMax());
+      // This reduce result is only valid in thread 0;
+      // broadcast to the rest of the warp
+      auto best_thread = __shfl_sync(0xffffffff, best.key, 0);
+      // Best thread updates the split
+      if (threadIdx.x == best_thread) {
+        GradientPairPrecise left = missing_left ? bin + missing : bin;
+        GradientPairPrecise right = parent_sum - left;
+        auto best_thresh =
+            threadIdx.x + (scan_begin - gidx_begin);  // index of best threshold inside a feature.
+        best_split->Update(gain, missing_left ? kLeftDir : kRightDir, best_thresh, fidx, left,
+                           right, true, param);
+      }
     }
-    cub::CTA_SYNC();
   }
-}
+};
 
-template <int BLOCK_THREADS, typename GradientSumT>
-__global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(
+template <int kBlockSize>
+__global__ __launch_bounds__(kBlockSize) void EvaluateSplitsKernel(
     bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
     const EvaluateSplitSharedInputs shared_inputs,
    common::Span<bst_feature_t> sorted_idx,
-    TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
+    const TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
     common::Span<DeviceSplitCandidate> out_candidates) {
-  // KeyValuePair here used as threadIdx.x -> gain_value
-  using ArgMaxT = cub::KeyValuePair<int, float>;
-  using BlockScanT = cub::BlockScan<GradientSumT, BLOCK_THREADS, cub::BLOCK_SCAN_WARP_SCANS>;
-  using MaxReduceT = cub::BlockReduce<ArgMaxT, BLOCK_THREADS>;
-
-  using SumReduceT = cub::BlockReduce<GradientSumT, BLOCK_THREADS>;
-
-  union TempStorage {
-    typename BlockScanT::TempStorage scan;
-    typename MaxReduceT::TempStorage max_reduce;
-    typename SumReduceT::TempStorage sum_reduce;
-  };
-
   // Aligned && shared storage for best_split
   __shared__ cub::Uninitialized<DeviceSplitCandidate> uninitialized_split;
   DeviceSplitCandidate &best_split = uninitialized_split.Alias();
-  __shared__ TempStorage temp_storage;
 
   if (threadIdx.x == 0) {
     best_split = DeviceSplitCandidate();
@@ -232,25 +229,23 @@ __global__ __launch_bounds__(BLOCK_THREADS) void EvaluateSplitsKernel(
 
   int fidx = inputs.feature_set[blockIdx.x % number_active_features];
 
+  using AgentT = EvaluateSplitAgent<kBlockSize>;
+  __shared__ typename AgentT::TempStorage temp_storage;
+  AgentT agent(&temp_storage, fidx, inputs, shared_inputs, evaluator);
+
   if (common::IsCat(shared_inputs.feature_types, fidx)) {
     auto n_bins_in_feat =
         shared_inputs.feature_segments[fidx + 1] - shared_inputs.feature_segments[fidx];
     if (common::UseOneHot(n_bins_in_feat, shared_inputs.param.max_cat_to_onehot)) {
-      EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                      kOneHot>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split,
-                               &temp_storage);
+      agent.OneHot(&best_split);
     } else {
       auto total_bins = shared_inputs.feature_values.size();
       size_t offset = total_bins * input_idx;
       auto node_sorted_idx = sorted_idx.subspan(offset, total_bins);
-      EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                      kPart>(fidx, inputs, shared_inputs, evaluator, node_sorted_idx, offset,
-                             &best_split, &temp_storage);
+      agent.Partition(&best_split, node_sorted_idx.data(), offset);
     }
   } else {
-    EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
-                    kNum>(fidx, inputs, shared_inputs, evaluator, sorted_idx, 0, &best_split,
-                          &temp_storage);
+    agent.Numerical(&best_split);
   }
 
   cub::CTA_SYNC();
@@ -310,8 +305,7 @@ __device__ void SetCategoricalSplit(const EvaluateSplitSharedInputs &shared_inpu
   }
 }
 
-template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::LaunchEvaluateSplits(
+void GPUHistEvaluator::LaunchEvaluateSplits(
     bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
     EvaluateSplitSharedInputs shared_inputs,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
@@ -326,7 +320,7 @@ void GPUHistEvaluator<GradientSumT>::LaunchEvaluateSplits(
   // One block for each feature
   uint32_t constexpr kBlockThreads = 32;
   dh::LaunchKernel {static_cast<uint32_t>(combined_num_features), kBlockThreads, 0}(
-      EvaluateSplitsKernel<kBlockThreads, GradientSumT>, number_active_features, d_inputs,
+      EvaluateSplitsKernel<kBlockThreads>, number_active_features, d_inputs,
       shared_inputs, this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size()),
       evaluator, dh::ToSpan(feature_best_splits));
 
@@ -345,8 +339,7 @@ void GPUHistEvaluator<GradientSumT>::LaunchEvaluateSplits(
                     reduce_offset + 1);
 }
 
-template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::CopyToHost(const std::vector<bst_node_t> &nidx) {
+void GPUHistEvaluator::CopyToHost(const std::vector<bst_node_t> &nidx) {
   if (!has_categoricals_) return;
   auto d_cats = this->DeviceCatStorage(nidx);
   auto h_cats = this->HostCatStorage(nidx);
@@ -360,8 +353,7 @@ void GPUHistEvaluator<GradientSumT>::CopyToHost(const std::vector<bst_node_t> &n
   }
 }
 
-template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
+void GPUHistEvaluator::EvaluateSplits(
     const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
     common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
     common::Span<GPUExpandEntry> out_entries) {
@@ -379,6 +371,10 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
   dh::LaunchN(d_inputs.size(), [=] __device__(size_t i) mutable {
     auto const input = d_inputs[i];
     auto &split = out_splits[i];
+    // Subtract the parent gain here. As it is constant for a node, subtracting it once here
+    // is more efficient than doing it during every split evaluation.
+    float parent_gain = CalcGain(shared_inputs.param, input.parent_sum);
+    split.loss_chg -= parent_gain;
     auto fidx = out_splits[i].findex;
 
     if (split.is_cat) {
@@ -400,8 +396,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
   this->CopyToHost(nidx);
 }
 
-template <typename GradientSumT>
-GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
+GPUExpandEntry GPUHistEvaluator::EvaluateSingleSplit(
     EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs) {
   dh::device_vector<EvaluateSplitInputs> inputs = std::vector<EvaluateSplitInputs>{input};
   dh::TemporaryArray<GPUExpandEntry> out_entries(1);
@@ -413,6 +408,5 @@ GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
   return root_entry;
 }
 
-template class GPUHistEvaluator<GradientPairPrecise>;
 }  // namespace tree
 }  // namespace xgboost
diff --git a/src/tree/gpu_hist/evaluate_splits.cuh b/src/tree/gpu_hist/evaluate_splits.cuh
index a4c2e271fc8d..3e3e51b8e7c2 100644
--- a/src/tree/gpu_hist/evaluate_splits.cuh
+++ b/src/tree/gpu_hist/evaluate_splits.cuh
@@ -51,7 +51,6 @@ struct CatAccessor {
   }
 };
 
-template <typename GradientSumT>
 class GPUHistEvaluator {
   using CatST = common::CatBitField::value_type;  // categorical storage type
   // use pinned memory to stage the categories, used for sort based splits.
diff --git a/src/tree/gpu_hist/evaluator.cu b/src/tree/gpu_hist/evaluator.cu
index 634734943aa0..707c64a29dcc 100644
--- a/src/tree/gpu_hist/evaluator.cu
+++ b/src/tree/gpu_hist/evaluator.cu
@@ -14,8 +14,7 @@
 namespace xgboost {
 namespace tree {
 
-template <typename GradientSumT>
-void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
+void GPUHistEvaluator::Reset(common::HistogramCuts const &cuts,
                              common::Span<FeatureType const> ft,
                              bst_feature_t n_features, TrainParam const &param,
                              int32_t device) {
@@ -68,8 +67,7 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
   }
 }
 
-template <typename GradientSumT>
-common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
+common::Span<bst_feature_t const> GPUHistEvaluator::SortHistogram(
     common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
     TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
   dh::XGBCachingDeviceAllocator<char> alloc;
@@ -128,7 +126,5 @@ common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
   return dh::ToSpan(cat_sorted_idx_);
 }
 
-template class GPUHistEvaluator<GradientPair>;
-template class GPUHistEvaluator<GradientPairPrecise>;
 }  // namespace tree
 }  // namespace xgboost
diff --git a/src/tree/param.h b/src/tree/param.h
index c94437732299..7e32d1e10721 100644
--- a/src/tree/param.h
+++ b/src/tree/param.h
@@ -255,7 +255,7 @@ XGBOOST_DEVICE inline T CalcWeight(const TrainingParams &p, T sum_grad,
 // calculate the cost of loss function
 template <typename TrainingParams, typename T>
 XGBOOST_DEVICE inline T CalcGain(const TrainingParams &p, T sum_grad, T sum_hess) {
-  if (sum_hess < p.min_child_weight) {
+  if (sum_hess < p.min_child_weight || sum_hess <= 0.0) {
     return T(0.0);
   }
   if (p.max_delta_step == 0.0f) {
diff --git a/src/tree/split_evaluator.h b/src/tree/split_evaluator.h
index 30263a6eb22d..d19755d37bb1 100644
--- a/src/tree/split_evaluator.h
+++ b/src/tree/split_evaluator.h
@@ -71,11 +71,10 @@ class TreeEvaluator {
     const float* upper;
     bool has_constraint;
 
-    XGBOOST_DEVICE float CalcSplitGain(const ParamT &param, bst_node_t nidx,
-                                       bst_feature_t fidx,
-                                       tree::GradStats const& left,
-                                       tree::GradStats const& right) const {
-      int constraint = constraints[fidx];
+    template <typename GradientSumT>
+    XGBOOST_DEVICE float CalcSplitGain(const ParamT& param, bst_node_t nidx, bst_feature_t fidx,
+                                       GradientSumT const& left, GradientSumT const& right) const {
+      int constraint = has_constraint ? constraints[fidx] : 0;
       const float negative_infinity = -std::numeric_limits<float>::infinity();
       float wleft = this->CalcWeight(nidx, param, left);
       float wright = this->CalcWeight(nidx, param, right);
@@ -92,8 +91,9 @@ class TreeEvaluator {
       }
     }
 
+    template <typename GradientSumT>
     XGBOOST_DEVICE float CalcWeight(bst_node_t nodeid, const ParamT &param,
-                                    tree::GradStats const& stats) const {
+                                    GradientSumT const& stats) const {
       float w = ::xgboost::tree::CalcWeight(param, stats);
       if (!has_constraint) {
         return w;
@@ -118,21 +118,32 @@ class TreeEvaluator {
       return ::xgboost::tree::CalcWeight(param, stats);
     }
 
-    XGBOOST_DEVICE float
-    CalcGainGivenWeight(ParamT const &p, tree::GradStats const& stats, float w) const {
+    // Fast floating point division instruction on device
+    XGBOOST_DEVICE float Divide(float a, float b) const {
+#ifdef __CUDA_ARCH__
+      return __fdividef(a, b);
+#else
+      return a / b;
+#endif
+    }
+
+    template <typename GradientSumT>
+    XGBOOST_DEVICE float CalcGainGivenWeight(ParamT const& p, GradientSumT const& stats,
+                                             float w) const {
       if (stats.GetHess() <= 0) {
         return .0f;
       }
       // Avoiding tree::CalcGainGivenWeight can significantly reduce avg floating point error.
       if (p.max_delta_step == 0.0f && has_constraint == false) {
-        return common::Sqr(ThresholdL1(stats.sum_grad, p.reg_alpha)) /
-               (stats.sum_hess + p.reg_lambda);
+        return Divide(common::Sqr(ThresholdL1(stats.GetGrad(), p.reg_alpha)),
+                      (stats.GetHess() + p.reg_lambda));
       }
-      return tree::CalcGainGivenWeight(p, stats.sum_grad,
-                                       stats.sum_hess, w);
+      return tree::CalcGainGivenWeight<ParamT, float>(p, stats.GetGrad(),
+                                                      stats.GetHess(), w);
     }
 
+    template <typename GradientSumT>
     XGBOOST_DEVICE float CalcGain(bst_node_t nid, ParamT const &p,
-                                  tree::GradStats const& stats) const {
+                                  GradientSumT const& stats) const {
       return this->CalcGainGivenWeight(p, stats, this->CalcWeight(nid, p, stats));
     }
   };
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 79aad1708e1e..3f3137c58adc 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -171,7 +171,7 @@ class DeviceHistogramStorage {
 template <typename GradientSumT>
 struct GPUHistMakerDevice {
  private:
-  GPUHistEvaluator<GradientSumT> evaluator_;
+  GPUHistEvaluator evaluator_;
   Context const* ctx_;
 
  public:
diff --git a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
index eec029c92537..28b69122deae 100644
--- a/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
+++ b/tests/cpp/tree/gpu_hist/test_evaluate_splits.cu
@@ -62,7 +62,7 @@ void TestEvaluateSingleSplit(bool is_categorical) {
       cuts.min_vals_.ConstDeviceSpan(),
   };
 
-  GPUHistEvaluator<GradientPairPrecise> evaluator{
+  GPUHistEvaluator evaluator{
       tparam, static_cast<bst_feature_t>(feature_set.size()), 0};
   evaluator.Reset(cuts, dh::ToSpan(feature_types), feature_set.size(), tparam, 0);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
@@ -109,7 +109,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
       dh::ToSpan(feature_min_values),
   };
 
-  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_set.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_set.size(), 0);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -121,7 +121,7 @@ TEST(GpuHist, EvaluateSingleSplitMissing) {
 
 TEST(GpuHist, EvaluateSingleSplitEmpty) {
   TrainParam tparam = ZeroParam();
-  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, 1, 0);
+  GPUHistEvaluator evaluator(tparam, 1, 0);
   DeviceSplitCandidate result =
       evaluator.EvaluateSingleSplit(EvaluateSplitInputs{}, EvaluateSplitSharedInputs{}).split;
   EXPECT_EQ(result.findex, -1);
@@ -159,7 +159,7 @@ TEST(GpuHist, EvaluateSingleSplitFeatureSampling) {
       dh::ToSpan(feature_min_values),
   };
 
-  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
 
   EXPECT_EQ(result.findex, 1);
@@ -199,7 +199,7 @@ TEST(GpuHist, EvaluateSingleSplitBreakTies) {
      dh::ToSpan(feature_min_values),
   };
 
-  GPUHistEvaluator<GradientPairPrecise> evaluator(tparam, feature_min_values.size(), 0);
+  GPUHistEvaluator evaluator(tparam, feature_min_values.size(), 0);
   DeviceSplitCandidate result = evaluator.EvaluateSingleSplit(input,shared_inputs).split;
 
   EXPECT_EQ(result.findex, 0);
@@ -246,7 +246,7 @@ TEST(GpuHist, EvaluateSplits) {
       dh::ToSpan(feature_min_values),
   };
 
-  GPUHistEvaluator<GradientPairPrecise> evaluator{
+  GPUHistEvaluator evaluator{
       tparam, static_cast<bst_feature_t>(feature_min_values.size()), 0};
   dh::device_vector<EvaluateSplitInputs> inputs =
       std::vector<EvaluateSplitInputs>{input_left, input_right};
   evaluator.LaunchEvaluateSplits(input_left.feature_set.size(), dh::ToSpan(inputs), shared_inputs,
                                  evaluator.GetEvaluator(),
@@ -263,7 +263,7 @@ TEST(GpuHist, EvaluateSplits) {
 
 TEST_F(TestPartitionBasedSplit, GpuHist) {
   dh::device_vector<FeatureType> ft{std::vector<FeatureType>{FeatureType::kCategorical}};
-  GPUHistEvaluator<GradientPairPrecise> evaluator{param_,
+  GPUHistEvaluator evaluator{param_,
                              static_cast<bst_feature_t>(info_.num_col_), 0};
 
   cuts_.cut_ptrs_.SetDevice(0);
@@ -287,5 +287,6 @@ TEST_F(TestPartitionBasedSplit, GpuHist) {
   auto split = evaluator.EvaluateSingleSplit(input, shared_inputs).split;
   ASSERT_NEAR(split.loss_chg, best_score_, 1e-16);
 }
+
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_evaluate_splits.h b/tests/cpp/tree/test_evaluate_splits.h
index bbd8b98eb0fc..7925e40cdb26 100644
--- a/tests/cpp/tree/test_evaluate_splits.h
+++ b/tests/cpp/tree/test_evaluate_splits.h
@@ -43,6 +43,8 @@ class TestPartitionBasedSplit : public ::testing::Test {
     auto &h_vals = cuts_.cut_values_.HostVector();
     h_vals.resize(n_bins_);
     std::iota(h_vals.begin(), h_vals.end(), 0.0);
+
+    cuts_.min_vals_.Resize(1);
 
     hist_.Init(cuts_.TotalBins());
     hist_.AddHistRow(0);
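
The heart of this patch is the single-warp design: because block_size == warp_size, a reduction result can be shared across the whole block with `__shfl_sync` instead of a shared-memory broadcast guarded by `__syncthreads()`, exactly as `EvaluateSplitAgent` does after its `WarpReduce` calls. A minimal, self-contained sketch of that argmax-reduce-and-broadcast pattern (illustrative only, not code from this patch; it assumes CUB is available and a launch with exactly 32 threads per block):

```cuda
#include <cfloat>
#include <cub/cub.cuh>

// One 32-thread block finds the index of the largest value in `values`.
// The argmax is computed with cub::WarpReduce and the winner is then
// broadcast to every lane with __shfl_sync - no barrier or shared-memory
// round trip is needed because all threads belong to the same warp.
__global__ void WarpArgMaxKernel(const float *values, int n, int *out_idx) {
  using ArgMaxT = cub::KeyValuePair<int, float>;
  using MaxReduceT = cub::WarpReduce<ArgMaxT>;
  __shared__ typename MaxReduceT::TempStorage temp_storage;

  // Each thread strides over the input, keeping its local best.
  ArgMaxT local{-1, -FLT_MAX};
  for (int i = threadIdx.x; i < n; i += 32) {
    if (values[i] > local.value) {
      local = {i, values[i]};
    }
  }
  // Warp-wide argmax; the result is only valid in lane 0.
  ArgMaxT best = MaxReduceT(temp_storage).Reduce(local, cub::ArgMax());
  // Broadcast lane 0's winner to the rest of the warp.
  int best_idx = __shfl_sync(0xffffffff, best.key, 0);
  if (threadIdx.x == 0) {
    *out_idx = best_idx;
  }
}

// Launched with a single warp per block, e.g.:
//   WarpArgMaxKernel<<<1, 32>>>(d_values, n, d_out);
```

In the patched kernel the same idea appears twice: once in `ReduceFeature` (sum reduce, then `__shfl_sync` of the grad/hess components) and once per scan iteration to pick the best split candidate.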