Fuse split evaluation kernels (#8026)
RAMitchell committed Jul 5, 2022
1 parent ff1c559 commit 794cbaa
Showing 6 changed files with 309 additions and 315 deletions.
251 changes: 112 additions & 139 deletions src/tree/gpu_hist/evaluate_splits.cu

Large diffs are not rendered by default.

88 changes: 47 additions & 41 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -17,24 +17,40 @@ class HistogramCuts;
}

namespace tree {
template <typename GradientSumT>

// Inputs specific to each node
struct EvaluateSplitInputs {
int nidx;
int depth;
GradientPairPrecise parent_sum;
GPUTrainingParam param;
common::Span<const bst_feature_t> feature_set;
common::Span<const GradientPairPrecise> gradient_histogram;
};

// Inputs necessary for all nodes
struct EvaluateSplitSharedInputs {
GPUTrainingParam param;
common::Span<FeatureType const> feature_types;
common::Span<const uint32_t> feature_segments;
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;
common::Span<const GradientSumT> gradient_histogram;

XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
__device__ auto FeatureBins(bst_feature_t fidx) const {
return feature_segments[fidx + 1] - feature_segments[fidx];
}
};
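
// Editor's note: feature_segments acts as a CSR-style offset array over the flattened
// histogram, which is what makes Features() and FeatureBins() one-liners. A minimal
// host-side sketch of that arithmetic, with made-up bin counts rather than real cut data:

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Three features with 4, 2 and 3 histogram bins respectively (illustrative values only).
  std::vector<uint32_t> feature_segments{0, 4, 6, 9};
  auto n_features = feature_segments.size() - 1;          // == Features()
  assert(n_features == 3);
  // FeatureBins(fidx) == feature_segments[fidx + 1] - feature_segments[fidx]
  assert(feature_segments[2] - feature_segments[1] == 2);  // feature 1 has 2 bins
  return 0;
}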

// Used to return internal storage regions for categoricals
// Usable on device
struct CatAccessor {
common::Span<common::CatBitField::value_type> cat_storage;
std::size_t node_categorical_storage_size;
XGBOOST_DEVICE common::Span<common::CatBitField::value_type> GetNodeCatStorage(bst_node_t nidx) {
return this->cat_storage.subspan(nidx * this->node_categorical_storage_size,
this->node_categorical_storage_size);
}
};
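
// Editor's note: CatAccessor hands every node a fixed-size slice of one flat bit-field
// buffer instead of a per-node allocation. A rough host-side analogue of GetNodeCatStorage,
// assuming a made-up storage size of two words per node:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::size_t node_size = 2;                    // words of category bits per node (made up)
  std::vector<uint32_t> cat_storage(8 * node_size);   // flat buffer with room for 8 nodes
  auto node_slice = [&](int nidx) {
    return cat_storage.data() + nidx * node_size;     // == subspan(nidx * node_size, node_size)
  };
  node_slice(3)[0] |= 1u << 5;                        // set one category bit for node 3
  std::cout << cat_storage[3 * node_size] << "\n";    // prints 32
  return 0;
}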

template <typename GradientSumT>
class GPUHistEvaluator {
using CatST = common::CatBitField::value_type; // categorical storage type
@@ -61,61 +77,53 @@ class GPUHistEvaluator {
// Do we have any categorical features that require sorting histograms?
// use this to skip the expensive sort step
bool need_sort_histogram_ = false;
bool has_categoricals_ = false;
// Number of elements of categorical storage type
// needed to hold categoricals for a single mode
std::size_t node_categorical_storage_size_ = 0;

// Copy the categories from device to host asynchronously.
void CopyToHost(EvaluateSplitInputs<GradientSumT> const &input, common::Span<CatST> cats_out);
void CopyToHost(const std::vector<bst_node_t>& nidx);

/**
* \brief Get host category storage of nidx for internal calculation.
*/
auto HostCatStorage(bst_node_t nidx) {

std::size_t min_size=(nidx+2)*node_categorical_storage_size_;
if(h_split_cats_.size()<min_size){
auto HostCatStorage(const std::vector<bst_node_t> &nidx) {
if (!has_categoricals_) return CatAccessor{};
auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
if (h_split_cats_.size() < min_size) {
h_split_cats_.resize(min_size);
}

if (nidx == RegTree::kRoot) {
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
return CatAccessor{{h_split_cats_.data(), h_split_cats_.size()},
node_categorical_storage_size_};
}

/**
* \brief Get device category storage of nidx for internal calculation.
*/
auto DeviceCatStorage(bst_node_t nidx) {
std::size_t min_size=(nidx+2)*node_categorical_storage_size_;
if(split_cats_.size()<min_size){
auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
if (!has_categoricals_) return CatAccessor{};
auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
if (split_cats_.size() < min_size) {
split_cats_.resize(min_size);
}
if (nidx == RegTree::kRoot) {
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
return CatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_};
}

/**
* \brief Get sorted index storage based on the left node of inputs.
*/
auto SortedIdx(EvaluateSplitInputs<GradientSumT> left) {
if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
return dh::ToSpan(cat_sorted_idx_).first(left.feature_values.size());
}
auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
if (!need_sort_histogram_) return common::Span<bst_feature_t>();
cat_sorted_idx_.resize(num_nodes * total_bins);
return dh::ToSpan(cat_sorted_idx_);
}

auto SortInput(EvaluateSplitInputs<GradientSumT> left) {
if (left.nidx == RegTree::kRoot && !cat_sorted_idx_.empty()) {
return dh::ToSpan(sort_input_).first(left.feature_values.size());
}
auto SortInput(int num_nodes, bst_feature_t total_bins) {
if (!need_sort_histogram_) return common::Span<SortPair>();
sort_input_.resize(num_nodes * total_bins);
return dh::ToSpan(sort_input_);
}
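
// Editor's note: both scratch buffers are now sized num_nodes * total_bins, so a single
// flattened index addresses any (node, bin) pair. A small sketch of that index arithmetic,
// using arbitrary sizes:

#include <cstdio>

int main() {
  const int num_nodes = 3, total_bins = 9;             // made-up sizes
  for (int i = 0; i < num_nodes * total_bins; ++i) {
    int node = i / total_bins;                          // which EvaluateSplitInputs entry
    int bin = i % total_bins;                           // bin within that node's histogram
    if (bin == 0) std::printf("node %d starts at global index %d\n", node, i);
  }
  return 0;
}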

@@ -154,26 +162,24 @@ class GPUHistEvaluator {
/**
* \brief Sort the histogram based on output to obtain contiguous partitions.
*/
common::Span<bst_feature_t const> SortHistogram(
EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
common::Span<bst_feature_t const> SortHistogram(common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);

// impl of evaluate splits, contains CUDA kernels so it's public
void EvaluateSplits(EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
void LaunchEvaluateSplits(bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
/**
* \brief Evaluate splits for left and right nodes.
*/
void EvaluateSplits(GPUExpandEntry candidate,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
void EvaluateSplits(const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features, common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_splits);
/**
* \brief Evaluate splits for root node.
*/
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs<GradientSumT> input, float weight);
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input, EvaluateSplitSharedInputs shared_inputs, float weight);
};
} // namespace tree
} // namespace xgboost
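
// Editor's note: taken together, the header now separates per-node inputs from inputs
// shared by every node in a batch. A stand-in sketch of how a caller assembles such a
// batch, using local mock types rather than the real evaluate_splits.cuh declarations:

#include <cstdint>
#include <vector>

struct NodeInput {    // ~ EvaluateSplitInputs: one entry per node being evaluated
  int nidx;
  int depth;
  const double* gradient_histogram;  // that node's histogram slice
};
struct SharedInput {  // ~ EvaluateSplitSharedInputs: built once, reused by every node
  const uint32_t* feature_segments;
  const float* feature_values;
  const float* min_fvalue;
};

int main() {
  std::vector<double> hist(2 * 6, 0.0);          // two nodes, 6 bins each (made up)
  std::vector<uint32_t> segments{0, 3, 6};       // two features with 3 bins each
  std::vector<float> values(6, 0.f), min_fvalue(2, 0.f);
  SharedInput shared{segments.data(), values.data(), min_fvalue.data()};
  std::vector<NodeInput> batch{{1, 1, hist.data()},       // left child
                               {2, 1, hist.data() + 6}};  // right child
  // A fused evaluator consumes `batch` and `shared` in a single launch rather than once per node.
  (void)shared;
  return static_cast<int>(batch.size()) - 2;     // 0 on success
}
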
37 changes: 21 additions & 16 deletions src/tree/gpu_hist/evaluator.cu
@@ -21,6 +21,7 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();
if (cuts.HasCategorical()) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
@@ -69,42 +70,46 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,

template <typename GradientSumT>
common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
EvaluateSplitInputs<GradientSumT> const &left, EvaluateSplitInputs<GradientSumT> const &right,
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto sorted_idx = this->SortedIdx(left);
auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
dh::Iota(sorted_idx);
auto data = this->SortInput(left);
auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size());
auto it = thrust::make_counting_iterator(0u);
auto d_feature_idx = dh::ToSpan(feature_idx_);
auto total_bins = shared_inputs.feature_values.size();
thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data),
[=] XGBOOST_DEVICE(uint32_t i) {
auto is_left = i < left.feature_values.size();
auto const &input = is_left ? left : right;
auto j = i - (is_left ? 0 : input.feature_values.size());
auto const &input = d_inputs[i / total_bins];
auto j = i % total_bins;
auto fidx = d_feature_idx[j];
if (common::IsCat(input.feature_types, fidx)) {
auto lw = evaluator.CalcWeightCat(input.param, input.gradient_histogram[j]);
if (common::IsCat(shared_inputs.feature_types, fidx)) {
auto lw = evaluator.CalcWeightCat(shared_inputs.param,
input.gradient_histogram[j]);
return thrust::make_tuple(i, lw);
}
return thrust::make_tuple(i, 0.0);
});
// Sort an array segmented according to
// - nodes
// - features within each node
// - gradients within each feature
thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data),
dh::tbegin(sorted_idx),
[=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
auto li = thrust::get<0>(l);
auto ri = thrust::get<0>(r);

auto l_is_left = li < left.feature_values.size();
auto r_is_left = ri < left.feature_values.size();
auto l_node = li / total_bins;
auto r_node = ri / total_bins;

if (l_is_left != r_is_left) {
return l_is_left; // not the same node
if (l_node != r_node) {
return l_node < r_node; // not the same node
}

auto const &input = l_is_left ? left : right;
li -= (l_is_left ? 0 : input.feature_values.size());
ri -= (r_is_left ? 0 : input.feature_values.size());
li = li % total_bins;
ri = ri % total_bins;

auto lfidx = d_feature_idx[li];
auto rfidx = d_feature_idx[ri];
@@ -113,7 +118,7 @@ common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
return lfidx < rfidx; // not the same feature
}

if (common::IsCat(input.feature_types, lfidx)) {
if (common::IsCat(shared_inputs.feature_types, lfidx)) {
auto lw = thrust::get<1>(l);
auto rw = thrust::get<1>(r);
return lw < rw;
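
// Editor's note: the comparator above keeps the flattened histogram grouped by node,
// then by feature within each node, with categorical bins further ordered by weight.
// A toy host-side version of the same key ordering with std::stable_sort, using made-up
// weights in place of CalcWeightCat:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int total_bins = 4;
  std::vector<int> feature_of_bin{0, 0, 1, 1};        // analogue of d_feature_idx
  std::vector<float> weight{0.9f, 0.1f, 0.3f, 0.2f,   // node 0
                            0.5f, 0.4f, 0.8f, 0.7f};  // node 1
  std::vector<int> sorted_idx(2 * total_bins);
  for (int i = 0; i < (int)sorted_idx.size(); ++i) sorted_idx[i] = i;
  std::stable_sort(sorted_idx.begin(), sorted_idx.end(), [&](int li, int ri) {
    int l_node = li / total_bins, r_node = ri / total_bins;
    if (l_node != r_node) return l_node < r_node;                     // not the same node
    int lf = feature_of_bin[li % total_bins], rf = feature_of_bin[ri % total_bins];
    if (lf != rf) return lf < rf;                                      // not the same feature
    return weight[li] < weight[ri];  // within a feature (only categorical gets a real weight)
  });
  for (int i : sorted_idx) std::printf("%d ", i);                      // 1 0 3 2 5 4 7 6
  std::printf("\n");
  return 0;
}
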
114 changes: 57 additions & 57 deletions src/tree/updater_gpu_hist.cu
@@ -196,6 +196,7 @@ struct GPUHistMakerDevice {
HistRounding<GradientSumT> histogram_rounding;

dh::PinnedMemory pinned;
dh::PinnedMemory pinned2;

common::Monitor monitor;
common::ColumnSampler column_sampler;
@@ -279,58 +280,64 @@ struct GPUHistMakerDevice {
common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
EvaluateSplitInputs<GradientSumT> inputs{nidx,
root_sum,
gpu_param,
feature_set,
feature_types,
matrix.feature_segments,
matrix.gidx_fvalue_map,
matrix.min_fvalue,
hist.GetNodeHistogram(nidx)};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, weight);
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
EvaluateSplitSharedInputs shared_inputs{
gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue,
};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs, weight);
return split;
}

void EvaluateLeftRightSplits(GPUExpandEntry candidate, int left_nidx, int right_nidx,
const RegTree& tree,
void EvaluateSplits(const std::vector<GPUExpandEntry>& candidates, const RegTree& tree,
common::Span<GPUExpandEntry> pinned_candidates_out) {
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2);
GPUTrainingParam gpu_param(param);
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
if (candidates.empty()) return;
dh::TemporaryArray<EvaluateSplitInputs> d_node_inputs(2 * candidates.size());
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);

EvaluateSplitInputs<GradientSumT> left{left_nidx,
candidate.split.left_sum,
gpu_param,
left_feature_set,
feature_types,
matrix.feature_segments,
matrix.gidx_fvalue_map,
matrix.min_fvalue,
hist.GetNodeHistogram(left_nidx)};
EvaluateSplitInputs<GradientSumT> right{right_nidx,
candidate.split.right_sum,
gpu_param,
right_feature_set,
feature_types,
matrix.feature_segments,
matrix.gidx_fvalue_map,
matrix.min_fvalue,
hist.GetNodeHistogram(right_nidx)};

dh::TemporaryArray<GPUExpandEntry> entries(2);
this->evaluator_.EvaluateSplits(candidate, left, right, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(), entries.data().get(),
sizeof(GPUExpandEntry) * entries.size(), cudaMemcpyDeviceToHost));
}
EvaluateSplitSharedInputs shared_inputs{
GPUTrainingParam(param), feature_types, matrix.feature_segments,
matrix.gidx_fvalue_map, matrix.min_fvalue,
};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
for (int i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
int right_nidx = tree[candidate.nid].RightChild();
nidx[i * 2] = left_nidx;
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx);
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
left_feature_set, hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
right_feature_set, hist.GetNodeHistogram(right_nidx)};
}
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
for (auto input : h_node_inputs) {
CHECK_EQ(input.feature_set.size(), number_active_features)
<< "Current implementation assumes that the number of active features "
"(after sampling) in any node is the same";
}
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs),
cudaMemcpyDefault));

this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
dh::DefaultStream().Sync();
}
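
// Editor's note: each candidate now owns two consecutive slots, 2*i for its left child
// and 2*i + 1 for its right child, across the nidx, node-input and output-entry arrays,
// so one fused call covers the whole batch. A toy sketch of that slot layout, independent
// of the xgboost types:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  struct Candidate { int left_nidx, right_nidx; };
  std::vector<Candidate> candidates{{1, 2}, {3, 4}, {5, 6}};  // made-up node ids
  std::vector<int> nidx(2 * candidates.size());
  for (std::size_t i = 0; i < candidates.size(); ++i) {
    nidx[i * 2] = candidates[i].left_nidx;
    nidx[i * 2 + 1] = candidates[i].right_nidx;
  }
  // One fused evaluation fills 2 * candidates.size() output entries, which are copied
  // back and pushed into the driver in this same order.
  for (int n : nidx) std::printf("%d ", n);                   // 1 2 3 4 5 6
  std::printf("\n");
  return 0;
}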

void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -697,16 +704,9 @@ struct GPUHistMakerDevice {
this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
monitor.Stop("BuildHist");

for (auto i = 0ull; i < filtered_expand_set.size(); i++) {
auto candidate = filtered_expand_set.at(i);
int left_child_nidx = tree[candidate.nid].LeftChild();
int right_child_nidx = tree[candidate.nid].RightChild();

monitor.Start("EvaluateSplits");
this->EvaluateLeftRightSplits(candidate, left_child_nidx, right_child_nidx, *p_tree,
new_candidates.subspan(i * 2, 2));
monitor.Stop("EvaluateSplits");
}
monitor.Start("EvaluateSplits");
this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates);
monitor.Stop("EvaluateSplits");
dh::DefaultStream().Sync();
driver.Push(new_candidates.begin(), new_candidates.end());
expand_set = driver.Pop();
