Fuse split evaluation kernels #8026

Merged 18 commits on Jul 5, 2022
251 changes: 112 additions & 139 deletions src/tree/gpu_hist/evaluate_splits.cu

Large diffs are not rendered by default.

88 changes: 47 additions & 41 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -17,24 +17,40 @@ class HistogramCuts;
}

namespace tree {

// Inputs specific to each node
struct EvaluateSplitInputs {
int nidx;
int depth;
GradientPairPrecise parent_sum;
common::Span<const bst_feature_t> feature_set;
common::Span<const GradientPairPrecise> gradient_histogram;
};

// Inputs necessary for all nodes
struct EvaluateSplitSharedInputs {
GPUTrainingParam param;
common::Span<FeatureType const> feature_types;
common::Span<const uint32_t> feature_segments;
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;

XGBOOST_DEVICE auto Features() const { return feature_segments.size() - 1; }
__device__ auto FeatureBins(bst_feature_t fidx) const {
return feature_segments[fidx + 1] - feature_segments[fidx];
}
};
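
The shared inputs keep the histogram cut layout in CSR form: feature_segments holds one offset per feature plus a trailing sentinel, so Features() is size - 1 and FeatureBins() is a difference of adjacent offsets. A minimal host-side sketch of that layout, with hypothetical names and plain std::vector standing in for the device spans:

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical host analogue of the CSR bin layout above.
struct FeatureLayout {
  std::vector<uint32_t> segments;  // n_features + 1 offsets; last entry = total bins

  std::size_t Features() const { return segments.size() - 1; }
  uint32_t FeatureBins(std::size_t fidx) const {
    return segments[fidx + 1] - segments[fidx];
  }
};

int main() {
  FeatureLayout layout{{0, 4, 6, 14}};  // three features with 4, 2, and 8 bins
  return layout.Features() == 3 && layout.FeatureBins(1) == 2 ? 0 : 1;
}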

// Used to return internal storage regions for categoricals
// Usable on device
struct CatAccessor {
common::Span<common::CatBitField::value_type> cat_storage;
std::size_t node_categorical_storage_size;
XGBOOST_DEVICE common::Span<common::CatBitField::value_type> GetNodeCatStorage(bst_node_t nidx) {
return this->cat_storage.subspan(nidx * this->node_categorical_storage_size,
this->node_categorical_storage_size);
}
};
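
CatAccessor amounts to fixed-stride slicing: every node owns node_categorical_storage_size words of one flat buffer, so no per-node allocation happens on the evaluation path. A host-side sketch of the same arithmetic, assuming raw pointers in place of common::Span (names hypothetical):

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical host analogue: one flat buffer, node_stride words per node.
struct FlatNodeStorage {
  std::vector<uint32_t> storage;  // all nodes' category bits, back to back
  std::size_t node_stride;        // words reserved for a single node

  uint32_t* NodeBegin(int nidx) { return storage.data() + nidx * node_stride; }
  uint32_t* NodeEnd(int nidx) { return NodeBegin(nidx) + node_stride; }
};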

template <typename GradientSumT>
class GPUHistEvaluator {
using CatST = common::CatBitField::value_type; // categorical storage type
Expand All @@ -61,61 +77,53 @@ class GPUHistEvaluator {
// Do we have any categorical features that require sorting histograms?
// use this to skip the expensive sort step
bool need_sort_histogram_ = false;
bool has_categoricals_ = false;
// Number of elements of categorical storage type
// needed to hold categoricals for a single node
std::size_t node_categorical_storage_size_ = 0;

// Copy the categories from device to host asynchronously.
void CopyToHost(const std::vector<bst_node_t> &nidx);

/**
* \brief Get host category storage of the given nodes for internal calculation.
*/
auto HostCatStorage(const std::vector<bst_node_t> &nidx) {
  if (!has_categoricals_) return CatAccessor{};
  auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
  std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
  if (h_split_cats_.size() < min_size) {
    h_split_cats_.resize(min_size);
  }
  return CatAccessor{{h_split_cats_.data(), h_split_cats_.size()},
                     node_categorical_storage_size_};
}

/**
* \brief Get device category storage of the given nodes for internal calculation.
*/
auto DeviceCatStorage(const std::vector<bst_node_t> &nidx) {
  if (!has_categoricals_) return CatAccessor{};
  auto max_nidx = *std::max_element(nidx.begin(), nidx.end());
  std::size_t min_size = (max_nidx + 2) * node_categorical_storage_size_;
  if (split_cats_.size() < min_size) {
    split_cats_.resize(min_size);
  }
  return CatAccessor{dh::ToSpan(split_cats_), node_categorical_storage_size_};
}

/**
* \brief Get sorted-index scratch storage sized for the given nodes and bins.
*/
auto SortedIdx(int num_nodes, bst_feature_t total_bins) {
  if (!need_sort_histogram_) return common::Span<bst_feature_t>();
  cat_sorted_idx_.resize(num_nodes * total_bins);
  return dh::ToSpan(cat_sorted_idx_);
}

auto SortInput(int num_nodes, bst_feature_t total_bins) {
  if (!need_sort_histogram_) return common::Span<SortPair>();
  sort_input_.resize(num_nodes * total_bins);
  return dh::ToSpan(sort_input_);
}
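
Both scratch buffers are sized num_nodes * total_bins and treated as a row-major [node][bin] matrix, which is what lets one kernel cover every node: a flat index decomposes into a node by division and a bin by modulus, exactly as the device code below does. A small host-side check of that index math (hypothetical helper names):

#include <cassert>
#include <cstddef>

inline std::size_t NodeOf(std::size_t i, std::size_t total_bins) { return i / total_bins; }
inline std::size_t BinOf(std::size_t i, std::size_t total_bins) { return i % total_bins; }

int main() {
  std::size_t total_bins = 14;
  assert(NodeOf(17, total_bins) == 1);  // element 17 sits in node 1...
  assert(BinOf(17, total_bins) == 3);   // ...at bin 3 of that node's histogram
  return 0;
}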

@@ -154,26 +162,24 @@ class GPUHistEvaluator {
/**
* \brief Sort the histogram based on output to obtain contiguous partitions.
*/
common::Span<bst_feature_t const> SortHistogram(common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator);

// Implementation of evaluating splits; contains CUDA kernels, so it must be public.
void LaunchEvaluateSplits(bst_feature_t number_active_features,
                          common::Span<const EvaluateSplitInputs> d_inputs,
                          EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits);
/**
* \brief Evaluate splits for the given batch of nodes.
*/
void EvaluateSplits(const std::vector<bst_node_t> &nidx, bst_feature_t number_active_features,
                    common::Span<const EvaluateSplitInputs> d_inputs,
EvaluateSplitSharedInputs shared_inputs,
common::Span<GPUExpandEntry> out_splits);
/**
* \brief Evaluate splits for root node.
*/
GPUExpandEntry EvaluateSingleSplit(EvaluateSplitInputs input,
                                   EvaluateSplitSharedInputs shared_inputs, float weight);
};
} // namespace tree
} // namespace xgboost
37 changes: 21 additions & 16 deletions src/tree/gpu_hist/evaluator.cu
@@ -21,6 +21,7 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
has_categoricals_ = cuts.HasCategorical();
if (cuts.HasCategorical()) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
@@ -69,42 +70,46 @@

template <typename GradientSumT>
common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
common::Span<const EvaluateSplitInputs> d_inputs, EvaluateSplitSharedInputs shared_inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto sorted_idx = this->SortedIdx(d_inputs.size(), shared_inputs.feature_values.size());
dh::Iota(sorted_idx);
auto data = this->SortInput(d_inputs.size(), shared_inputs.feature_values.size());
auto it = thrust::make_counting_iterator(0u);
auto d_feature_idx = dh::ToSpan(feature_idx_);
auto total_bins = shared_inputs.feature_values.size();
thrust::transform(thrust::cuda::par(alloc), it, it + data.size(), dh::tbegin(data),
[=] XGBOOST_DEVICE(uint32_t i) {
auto const &input = d_inputs[i / total_bins];
auto j = i % total_bins;
auto fidx = d_feature_idx[j];
if (common::IsCat(shared_inputs.feature_types, fidx)) {
auto lw = evaluator.CalcWeightCat(shared_inputs.param,
input.gradient_histogram[j]);
return thrust::make_tuple(i, lw);
}
return thrust::make_tuple(i, 0.0);
});
// Sort an array segmented according to
// - nodes
// - features within each node
// - gradients within each feature
thrust::stable_sort_by_key(thrust::cuda::par(alloc), dh::tbegin(data), dh::tend(data),
dh::tbegin(sorted_idx),
[=] XGBOOST_DEVICE(SortPair const &l, SortPair const &r) {
auto li = thrust::get<0>(l);
auto ri = thrust::get<0>(r);

auto l_node = li / total_bins;
auto r_node = ri / total_bins;

if (l_node != r_node) {
return l_node < r_node; // not the same node
}

li = li % total_bins;
ri = ri % total_bins;

auto lfidx = d_feature_idx[li];
auto rfidx = d_feature_idx[ri];
@@ -113,7 +118,7 @@ common::Span<bst_feature_t const> GPUHistEvaluator<GradientSumT>::SortHistogram(
return lfidx < rfidx; // not the same feature
}

if (common::IsCat(shared_inputs.feature_types, lfidx)) {
auto lw = thrust::get<1>(l);
auto rw = thrust::get<1>(r);
return lw < rw;
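
The comparator above realizes the three-level segmented order described in the comment: node, then feature within the node, then (for categorical features) the weight. A standalone host sketch of the same key ordering with std::stable_sort, all names hypothetical; the real code sorts on device via thrust::stable_sort_by_key:

#include <algorithm>
#include <cstddef>
#include <vector>

struct BinKey {
  std::size_t node;     // which node's histogram the bin belongs to
  std::size_t feature;  // feature owning the bin
  float weight;         // CalcWeightCat output for categorical bins, else 0
};

// Stable sort keeps bins grouped by node and feature; within a categorical
// feature they end up ordered by weight, which makes a single left-to-right
// scan over the sorted categories a valid partition search.
void SegmentedSort(std::vector<BinKey>* keys) {
  std::stable_sort(keys->begin(), keys->end(), [](BinKey const& l, BinKey const& r) {
    if (l.node != r.node) return l.node < r.node;
    if (l.feature != r.feature) return l.feature < r.feature;
    return l.weight < r.weight;
  });
}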
114 changes: 57 additions & 57 deletions src/tree/updater_gpu_hist.cu
@@ -196,6 +196,7 @@ struct GPUHistMakerDevice {
HistRounding<GradientSumT> histogram_rounding;

dh::PinnedMemory pinned;
dh::PinnedMemory pinned2;

common::Monitor monitor;
common::ColumnSampler column_sampler;
@@ -279,58 +280,64 @@
common::Span<bst_feature_t> feature_set =
interaction_constraints.Query(sampled_features->DeviceSpan(), nidx);
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);
EvaluateSplitInputs inputs{nidx, 0, root_sum, feature_set, hist.GetNodeHistogram(nidx)};
EvaluateSplitSharedInputs shared_inputs{
gpu_param, feature_types, matrix.feature_segments, matrix.gidx_fvalue_map,
matrix.min_fvalue,
};
auto split = this->evaluator_.EvaluateSingleSplit(inputs, shared_inputs, weight);
return split;
}

void EvaluateSplits(const std::vector<GPUExpandEntry>& candidates, const RegTree& tree,
common::Span<GPUExpandEntry> pinned_candidates_out) {
if (candidates.empty()) return;
dh::TemporaryArray<EvaluateSplitInputs> d_node_inputs(2 * candidates.size());
dh::TemporaryArray<DeviceSplitCandidate> splits_out(2 * candidates.size());
std::vector<bst_node_t> nidx(2 * candidates.size());
auto h_node_inputs = pinned2.GetSpan<EvaluateSplitInputs>(2 * candidates.size());
auto matrix = page->GetDeviceAccessor(ctx_->gpu_id);

EvaluateSplitSharedInputs shared_inputs{
GPUTrainingParam(param), feature_types, matrix.feature_segments,
matrix.gidx_fvalue_map, matrix.min_fvalue,
};
dh::TemporaryArray<GPUExpandEntry> entries(2 * candidates.size());
for (int i = 0; i < candidates.size(); i++) {
auto candidate = candidates.at(i);
int left_nidx = tree[candidate.nid].LeftChild();
int right_nidx = tree[candidate.nid].RightChild();
nidx[i * 2] = left_nidx;
nidx[i * 2 + 1] = right_nidx;
auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx));
left_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> left_feature_set =
interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx);
auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx));
right_sampled_features->SetDevice(ctx_->gpu_id);
common::Span<bst_feature_t> right_feature_set =
interaction_constraints.Query(right_sampled_features->DeviceSpan(), right_nidx);
h_node_inputs[i * 2] = {left_nidx, candidate.depth + 1, candidate.split.left_sum,
left_feature_set, hist.GetNodeHistogram(left_nidx)};
h_node_inputs[i * 2 + 1] = {right_nidx, candidate.depth + 1, candidate.split.right_sum,
right_feature_set, hist.GetNodeHistogram(right_nidx)};
}
bst_feature_t number_active_features = h_node_inputs[0].feature_set.size();
for (auto input : h_node_inputs) {
CHECK_EQ(input.feature_set.size(), number_active_features)
<< "Current implementation assumes that the number of active features "
"(after sampling) in any node is the same";
}
dh::safe_cuda(cudaMemcpyAsync(d_node_inputs.data().get(), h_node_inputs.data(),
h_node_inputs.size() * sizeof(EvaluateSplitInputs),
cudaMemcpyDefault));

this->evaluator_.EvaluateSplits(nidx, number_active_features, dh::ToSpan(d_node_inputs),
shared_inputs, dh::ToSpan(entries));
dh::safe_cuda(cudaMemcpyAsync(pinned_candidates_out.data(),
entries.data().get(), sizeof(GPUExpandEntry) * entries.size(),
cudaMemcpyDeviceToHost));
dh::DefaultStream().Sync();
}
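
The point of the rewrite above is batching: instead of one kernel launch per candidate, the per-node inputs are staged in pinned host memory, copied to device in a single cudaMemcpyAsync, and evaluated by one fused launch. A stripped-down CUDA sketch of that pattern; NodeInput and EvaluateAll are hypothetical stand-ins, and error handling and pinned staging are elided:

#include <cuda_runtime.h>
#include <vector>

struct NodeInput { int nidx; };  // stand-in for EvaluateSplitInputs

__global__ void EvaluateAll(NodeInput const* inputs, int n, int* out) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = inputs[i].nidx;  // the real kernel evaluates splits here
}

void BatchedLaunch(std::vector<NodeInput> const& h_inputs, int* d_out) {
  int n = static_cast<int>(h_inputs.size());
  NodeInput* d_inputs;
  cudaMalloc(&d_inputs, n * sizeof(NodeInput));
  // One host-to-device copy for the whole batch (the real code copies from
  // pinned memory so the transfer can overlap with other work).
  cudaMemcpyAsync(d_inputs, h_inputs.data(), n * sizeof(NodeInput),
                  cudaMemcpyHostToDevice);
  EvaluateAll<<<(n + 255) / 256, 256>>>(d_inputs, n, d_out);  // one fused launch
  cudaDeviceSynchronize();
  cudaFree(d_inputs);
}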

void BuildHist(int nidx) {
auto d_node_hist = hist.GetNodeHistogram(nidx);
@@ -697,16 +704,9 @@ struct GPUHistMakerDevice {
this->BuildHistLeftRight(filtered_expand_set, reducer, tree);
monitor.Stop("BuildHist");

monitor.Start("EvaluateSplits");
this->EvaluateSplits(filtered_expand_set, *p_tree, new_candidates);
monitor.Stop("EvaluateSplits");
dh::DefaultStream().Sync();
driver.Push(new_candidates.begin(), new_candidates.end());
expand_set = driver.Pop();