Small refactor to categoricals #7858

Merged
merged 5 commits on May 5, 2022
Changes from 3 commits
4 changes: 2 additions & 2 deletions src/common/categorical.h
@@ -82,8 +82,8 @@ inline void InvalidCategory() {
/*!
* \brief Whether should we use onehot encoding for categorical data.
*/
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot, ObjInfo task) {
bool use_one_hot = n_cats < max_cat_to_onehot || task.UseOneHot();
XGBOOST_DEVICE inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
bool use_one_hot = n_cats < max_cat_to_onehot;
return use_one_hot;
}

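The predicate in src/common/categorical.h no longer consults the training objective: whether a categorical feature is split with one-hot encoding now depends only on its bin count and the `max_cat_to_onehot` parameter. A standalone sketch of the new behaviour (the predicate mirrors the hunk above; the sample values are made up):

```cpp
#include <cstdint>

// Simplified predicate from the hunk above: the decision depends only on the
// bin count and max_cat_to_onehot, not on the objective (ObjInfo).
inline bool UseOneHot(uint32_t n_cats, uint32_t max_cat_to_onehot) {
  return n_cats < max_cat_to_onehot;
}

int main() {
  // Hypothetical values: with max_cat_to_onehot = 4, a 3-category feature
  // takes the one-hot path, a 16-category feature takes the sort-based path.
  bool small_feature = UseOneHot(3, 4);   // true  -> one-hot split
  bool large_feature = UseOneHot(16, 4);  // false -> sort-based split
  return (small_feature && !large_feature) ? 0 : 1;
}
```
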
42 changes: 23 additions & 19 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -241,7 +241,7 @@ __global__ void EvaluateSplitsKernel(

if (common::IsCat(inputs.feature_types, fidx)) {
auto n_bins_in_feat = inputs.feature_segments[fidx + 1] - inputs.feature_segments[fidx];
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot, task)) {
if (common::UseOneHot(n_bins_in_feat, inputs.param.max_cat_to_onehot)) {
EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT, TempStorage, GradientSumT,
kOneHot>(fidx, inputs, evaluator, sorted_idx, 0, &best_split, &temp_storage);
} else {
@@ -273,12 +273,19 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,
* \brief Set the bits for categorical splits based on the split threshold.
*/
template <typename GradientSumT>
__device__ void SortBasedSplit(EvaluateSplitInputs<GradientSumT> const &input,
__device__ void SetCategoricalSplit(EvaluateSplitInputs<GradientSumT> const &input,
common::Span<bst_feature_t const> d_sorted_idx, bst_feature_t fidx,
bool is_left, common::Span<common::CatBitField::value_type> out,
DeviceSplitCandidate *p_out_split) {
DeviceSplitCandidate *p_out_split, ObjInfo task) {
auto &out_split = *p_out_split;
out_split.split_cats = common::CatBitField{out};

// Simple case for one hot split
if (common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot)) {
out_split.split_cats.Set(common::AsCat(out_split.fvalue));
return;
}

auto node_sorted_idx =
is_left ? d_sorted_idx.subspan(0, input.feature_values.size())
: d_sorted_idx.subspan(input.feature_values.size(), input.feature_values.size());
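
`SortBasedSplit` is renamed to `SetCategoricalSplit` and now handles the one-hot case itself: when `UseOneHot` holds, only the single matched category (taken from `fvalue`) is set in the split bitset and the function returns early; otherwise the node's sorted index decides the partition. A rough sketch of the two cases with a hypothetical bitfield type (not the real `common::CatBitField` API):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for common::CatBitField: one bit per category.
struct CatBits {
  std::vector<std::uint32_t> words;
  explicit CatBits(std::size_t n_cats) : words((n_cats + 31) / 32, 0u) {}
  void Set(std::uint32_t cat) { words[cat / 32] |= 1u << (cat % 32); }
};

// One-hot case: the winning category is already encoded in the split value,
// so only that single bit needs to be set.
CatBits OneHotSplit(std::uint32_t chosen_cat, std::size_t n_cats) {
  CatBits bits(n_cats);
  bits.Set(chosen_cat);
  return bits;
}

// Sort-based case: categories were ordered by gradient statistics and every
// category before the chosen threshold goes to one side of the split.
CatBits PartitionSplit(std::vector<std::uint32_t> const& sorted_cats,
                       std::size_t threshold, std::size_t n_cats) {
  CatBits bits(n_cats);
  for (std::size_t i = 0; i < threshold; ++i) {
    bits.Set(sorted_cats[i]);
  }
  return bits;
}
```
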
@@ -313,7 +320,7 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
EvaluateSplitInputs<GradientSumT> left, EvaluateSplitInputs<GradientSumT> right, ObjInfo task,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_splits) {
if (!split_cats_.empty()) {
if (need_sort_histogram_) {
this->SortHistogram(left, right, evaluator);
}

@@ -354,14 +361,13 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::CopyToHost(EvaluateSplitInputs<GradientSumT> const &input,
common::Span<CatST> cats_out) {
if (has_sort_) {
dh::CUDAEvent event;
event.Record(dh::DefaultStream());
auto h_cats = this->HostCatStorage(input.nidx);
copy_stream_.View().Wait(event);
dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
cudaMemcpyDeviceToHost, copy_stream_.View()));
}
if (cats_out.empty()) return;
dh::CUDAEvent event;
event.Record(dh::DefaultStream());
auto h_cats = this->HostCatStorage(input.nidx);
copy_stream_.View().Wait(event);
dh::safe_cuda(cudaMemcpyAsync(h_cats.data(), cats_out.data(), cats_out.size_bytes(),
cudaMemcpyDeviceToHost, copy_stream_.View()));
}
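
With the `has_sort_` flag gone, `CopyToHost` simply returns when there is no categorical storage for the node and otherwise performs the asynchronous copy. The ordering trick — record an event on the default stream, make the dedicated copy stream wait on it, then issue the device-to-host copy — can be sketched with raw CUDA runtime calls (illustrative only, not the `dh::` wrappers):

```cpp
#include <cuda_runtime.h>

#include <cstdint>
#include <vector>

int main() {
  constexpr std::size_t kWords = 4;
  std::uint32_t* d_cats = nullptr;
  cudaMalloc(&d_cats, kWords * sizeof(std::uint32_t));
  cudaMemset(d_cats, 0, kWords * sizeof(std::uint32_t));

  std::vector<std::uint32_t> h_cats(kWords);

  cudaStream_t copy_stream;
  cudaStreamCreate(&copy_stream);
  cudaEvent_t event;
  cudaEventCreate(&event);

  // Work producing d_cats would be enqueued on the default stream here.
  cudaEventRecord(event, /*stream=*/0);        // mark the point to wait for
  cudaStreamWaitEvent(copy_stream, event, 0);  // copy stream waits for it
  cudaMemcpyAsync(h_cats.data(), d_cats, kWords * sizeof(std::uint32_t),
                  cudaMemcpyDeviceToHost, copy_stream);

  cudaStreamSynchronize(copy_stream);  // GetHostNodeCats syncs the stream later
  cudaEventDestroy(event);
  cudaStreamDestroy(copy_stream);
  cudaFree(d_cats);
  return 0;
}
```
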

template <typename GradientSumT>
@@ -378,17 +384,16 @@ void GPUHistEvaluator<GradientSumT>::EvaluateSplits(GPUExpandEntry candidate, Ob
auto d_sorted_idx = this->SortedIdx(left);
auto d_entries = out_entries;
auto cats_out = this->DeviceCatStorage(left.nidx);
// turn candidate into entry, along with hanlding sort based split.
// turn candidate into entry, along with handling sort based split.
dh::LaunchN(right.feature_set.empty() ? 1 : 2, [=] __device__(size_t i) {
auto const &input = i == 0 ? left : right;
auto &split = out_splits[i];
auto fidx = out_splits[i].findex;

if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
if (split.is_cat) {
bool is_left = i == 0;
auto out = is_left ? cats_out.first(cats_out.size() / 2) : cats_out.last(cats_out.size() / 2);
SortBasedSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i]);
SetCategoricalSplit(input, d_sorted_idx, fidx, is_left, out, &out_splits[i], task);
}

float base_weight =
@@ -420,9 +425,8 @@ GPUExpandEntry GPUHistEvaluator<GradientSumT>::EvaluateSingleSplit(
auto &split = out_split[i];
auto fidx = out_split[i].findex;

if (split.is_cat &&
!common::UseOneHot(input.FeatureBins(fidx), input.param.max_cat_to_onehot, task)) {
SortBasedSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i]);
if (split.is_cat) {
SetCategoricalSplit(input, d_sorted_idx, fidx, true, cats_out, &out_split[i], task);
}

float left_weight = evaluator.CalcWeight(0, input.param, GradStats{split.left_sum});
34 changes: 22 additions & 12 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -58,9 +58,12 @@ class GPUHistEvaluator {
dh::device_vector<bst_feature_t> feature_idx_;
// Training param used for evaluation
TrainParam param_;
// whether the input data requires sort based split, which is more complicated so we try
// to avoid it if possible.
bool has_sort_{false};
// Do we have any categorical features that require sorting histograms?
// Use this to skip the expensive sort step.
bool need_sort_histogram_ = false;
// Number of elements of categorical storage type
// needed to hold the categories of a single node.
std::size_t node_categorical_storage_size_ = 0;

// Copy the categories from device to host asynchronously.
void CopyToHost(EvaluateSplitInputs<GradientSumT> const &input, common::Span<CatST> cats_out);
@@ -69,25 +72,33 @@
* \brief Get host category storage of nidx for internal calculation.
*/
auto HostCatStorage(bst_node_t nidx) {
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();

std::size_t min_size = (nidx + 2) * node_categorical_storage_size_;
if (h_split_cats_.size() < min_size) {
h_split_cats_.resize(min_size);
}

if (nidx == RegTree::kRoot) {
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits * 2);
auto cats_out = common::Span<CatST>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
}

/**
* \brief Get device category storage of nidx for internal calculation.
*/
auto DeviceCatStorage(bst_node_t nidx) {
auto cat_bits = split_cats_.size() / param_.MaxNodes();
std::size_t min_size = (nidx + 2) * node_categorical_storage_size_;
if (split_cats_.size() < min_size) {
split_cats_.resize(min_size);
}
if (nidx == RegTree::kRoot) {
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits);
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * cat_bits, cat_bits * 2);
auto cats_out = dh::ToSpan(split_cats_).subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_ * 2);
return cats_out;
}
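
Both accessors replace the old `h_split_cats_.size() / param_.MaxNodes()` arithmetic with a fixed per-node size and grow the backing buffer on demand to `(nidx + 2) * node_categorical_storage_size_` elements, enough for the node being evaluated and its sibling. A compact host-side sketch of that resize-then-offset logic (a hypothetical helper, not the real class):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of the lazy growth now used by HostCatStorage/DeviceCatStorage:
// instead of pre-allocating bitset storage for param_.MaxNodes() nodes, the
// buffer is grown to cover the requested node and its sibling.
// node_storage_size plays the role of node_categorical_storage_size_.
struct CatStorageSketch {
  std::size_t node_storage_size{1};
  std::vector<std::uint32_t> words;

  // Returns the starting offset of nidx's span, growing the buffer if needed.
  std::size_t EnsureNode(int nidx) {
    std::size_t min_size =
        (static_cast<std::size_t>(nidx) + 2) * node_storage_size;
    if (words.size() < min_size) {
      words.resize(min_size, 0u);
    }
    return static_cast<std::size_t>(nidx) * node_storage_size;
  }
};
```
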

@@ -114,7 +125,7 @@ class GPUHistEvaluator {
/**
* \brief Reset the evaluator, should be called before any use.
*/
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft, ObjInfo task,
void Reset(common::HistogramCuts const &cuts, common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param, int32_t device);

/**
@@ -123,8 +134,7 @@
*/
common::Span<CatST const> GetHostNodeCats(bst_node_t nidx) const {
copy_stream_.View().Sync();
auto cat_bits = h_split_cats_.size() / param_.MaxNodes();
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * cat_bits, cat_bits);
auto cats_out = common::Span<CatST const>{h_split_cats_}.subspan(nidx * node_categorical_storage_size_, node_categorical_storage_size_);
return cats_out;
}
/**
77 changes: 38 additions & 39 deletions src/tree/gpu_hist/evaluator.cu
@@ -16,12 +16,12 @@ namespace xgboost {
namespace tree {
template <typename GradientSumT>
void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
common::Span<FeatureType const> ft, ObjInfo task,
common::Span<FeatureType const> ft,
bst_feature_t n_features, TrainParam const &param,
int32_t device) {
param_ = param;
tree_evaluator_ = TreeEvaluator{param, n_features, device};
if (cuts.HasCategorical() && !task.UseOneHot()) {
if (cuts.HasCategorical()) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
auto beg = thrust::make_counting_iterator<size_t>(1ul);
@@ -30,46 +30,45 @@ void GPUHistEvaluator<GradientSumT>::Reset(common::HistogramCuts const &cuts,
// This condition avoids sort-based split function calls if the users want
// onehot-encoding-based splits.
// For some reason, any_of adds 1.5 minutes to compilation time for CUDA 11.x.
has_sort_ = thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
auto idx = i - 1;
if (common::IsCat(ft, idx)) {
auto n_bins = ptrs[i] - ptrs[idx];
bool use_sort = !common::UseOneHot(n_bins, to_onehot, task);
return use_sort;
}
return false;
});
need_sort_histogram_ =
thrust::any_of(thrust::cuda::par(alloc), beg, end, [=] XGBOOST_DEVICE(size_t i) {
auto idx = i - 1;
if (common::IsCat(ft, idx)) {
auto n_bins = ptrs[i] - ptrs[idx];
bool use_sort = !common::UseOneHot(n_bins, to_onehot);
return use_sort;
}
return false;
});

if (has_sort_) {
auto bit_storage_size = common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
CHECK_NE(bit_storage_size, 0);
// We need to allocate for all nodes since the updater can grow the tree layer by
// layer, all nodes in the same layer must be preserved until that layer is
// finished. We can allocate one layer at a time, but the best case is reducing the
// size of the bitset by about a half, at the cost of invoking CUDA malloc many more
// times than necessary.
split_cats_.resize(param.MaxNodes() * bit_storage_size);
h_split_cats_.resize(split_cats_.size());
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));
node_categorical_storage_size_ =
common::CatBitField::ComputeStorageSize(cuts.MaxCategory() + 1);
CHECK_NE(node_categorical_storage_size_, 0);
// We need to allocate for all nodes since the updater can grow the tree layer by
// layer, all nodes in the same layer must be preserved until that layer is
// finished. We can allocate one layer at a time, but the best case is reducing the
// size of the bitset by about a half, at the cost of invoking CUDA malloc many more
// times than necessary.
split_cats_.resize(node_categorical_storage_size_);
h_split_cats_.resize(node_categorical_storage_size_);
dh::safe_cuda(
cudaMemsetAsync(split_cats_.data().get(), '\0', split_cats_.size() * sizeof(CatST)));

cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());
cat_sorted_idx_.resize(cuts.cut_values_.Size() * 2); // evaluate 2 nodes at a time.
sort_input_.resize(cat_sorted_idx_.size());

/**
* cache feature index binary search result
*/
feature_idx_.resize(cat_sorted_idx_.size());
auto d_fidxes = dh::ToSpan(feature_idx_);
auto it = thrust::make_counting_iterator(0ul);
auto values = cuts.cut_values_.ConstDeviceSpan();
auto ptrs = cuts.cut_ptrs_.ConstDeviceSpan();
thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(),
feature_idx_.begin(), [=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(ptrs, i);
return fidx;
});
}
/**
* cache feature index binary search result
*/
feature_idx_.resize(cat_sorted_idx_.size());
auto d_fidxes = dh::ToSpan(feature_idx_);
auto it = thrust::make_counting_iterator(0ul);
auto values = cuts.cut_values_.ConstDeviceSpan();
thrust::transform(thrust::cuda::par(alloc), it, it + feature_idx_.size(), feature_idx_.begin(),
[=] XGBOOST_DEVICE(size_t i) {
auto fidx = dh::SegmentId(ptrs, i);
return fidx;
});
}
}

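`Reset` now always sizes the categorical bitset storage from `cuts.MaxCategory()` when categorical features are present, and the `thrust::any_of` scan only decides whether histogram sorting is needed (`need_sort_histogram_`). The same check, written as plain host code for clarity (illustrative names, not the device implementation above):

```cpp
#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

// Host-side sketch of the need_sort_histogram_ check: the sorted-histogram
// path is required as soon as any categorical feature has too many bins for
// one-hot encoding. cut_ptrs and is_cat are illustrative stand-ins for
// cuts.cut_ptrs_ and the feature-type span.
bool NeedSortHistogram(std::vector<std::uint32_t> const& cut_ptrs,
                       std::vector<bool> const& is_cat,
                       std::uint32_t max_cat_to_onehot) {
  std::vector<std::size_t> fidx(is_cat.size());
  std::iota(fidx.begin(), fidx.end(), std::size_t{0});
  return std::any_of(fidx.begin(), fidx.end(), [&](std::size_t f) {
    if (!is_cat[f]) return false;
    std::uint32_t n_bins = cut_ptrs[f + 1] - cut_ptrs[f];
    return !(n_bins < max_cat_to_onehot);  // same predicate as UseOneHot
  });
}
```
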
2 changes: 1 addition & 1 deletion src/tree/hist/evaluate_splits.h
@@ -244,7 +244,7 @@ template <typename GradientSumT, typename ExpandEntry> class HistEvaluator {
}
if (is_cat) {
auto n_bins = cut_ptrs.at(fidx + 1) - cut_ptrs[fidx];
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot, task_)) {
if (common::UseOneHot(n_bins, param_.max_cat_to_onehot)) {
EnumerateSplit<+1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
EnumerateSplit<-1, kOneHot>(cut, {}, histogram, fidx, nidx, evaluator, best);
} else {
43 changes: 13 additions & 30 deletions src/tree/updater_gpu_hist.cu
@@ -197,8 +197,6 @@ struct GPUHistMakerDevice {
std::unique_ptr<GradientBasedSampler> sampler;

std::unique_ptr<FeatureGroups> feature_groups;
// Storing split categories for last node.
dh::caching_device_vector<uint32_t> node_categories;

GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
@@ -243,7 +241,7 @@ struct GPUHistMakerDevice {
param.colsample_bytree);
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));

this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param,
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
ctx_->gpu_id);

this->interaction_constraints.Reset();
@@ -354,14 +352,14 @@ struct GPUHistMakerDevice {
return hist.HistogramExists(nidx_histogram) && hist.HistogramExists(nidx_parent);
}

void UpdatePosition(int nidx, RegTree* p_tree) {
RegTree::Node split_node = (*p_tree)[nidx];
auto split_type = p_tree->NodeSplitType(nidx);
void UpdatePosition(const GPUExpandEntry &e, RegTree* p_tree) {
RegTree::Node split_node = (*p_tree)[e.nid];
auto split_type = p_tree->NodeSplitType(e.nid);
auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id);
auto node_cats = dh::ToSpan(node_categories);
auto node_cats = e.split.split_cats.Bits();

row_partitioner->UpdatePosition(
nidx, split_node.LeftChild(), split_node.RightChild(),
e.nid, split_node.LeftChild(), split_node.RightChild(),
[=] __device__(bst_uint ridx) {
// given a row index, returns the node id it belongs to
bst_float cut_value =
@@ -567,27 +565,12 @@ struct GPUHistMakerDevice {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large.";
std::vector<uint32_t> split_cats;
if (candidate.split.split_cats.Bits().empty()) {
if (common::InvalidCat(candidate.split.fvalue)) {
common::InvalidCategory();
}
auto cat = common::AsCat(candidate.split.fvalue);
split_cats.resize(LBitField32::ComputeStorageSize(cat + 1), 0);
common::CatBitField cats_bits(split_cats);
cats_bits.Set(cat);
dh::CopyToD(split_cats, &node_categories);
} else {
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto max_cat = candidate.split.MaxCat();
split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0);
CHECK_LE(split_cats.size(), h_cats.size());
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());

node_categories.resize(candidate.split.split_cats.Bits().size());
dh::safe_cuda(cudaMemcpyAsync(
node_categories.data().get(), candidate.split.split_cats.Data(),
candidate.split.split_cats.Bits().size_bytes(), cudaMemcpyDeviceToDevice));
}
CHECK_GT(candidate.split.split_cats.Bits().size(), 0);
auto h_cats = this->evaluator_.GetHostNodeCats(candidate.nid);
auto max_cat = candidate.split.MaxCat();
split_cats.resize(common::CatBitField::ComputeStorageSize(max_cat + 1), 0);
CHECK_LE(split_cats.size(), h_cats.size());
std::copy(h_cats.data(), h_cats.data() + split_cats.size(), split_cats.data());

tree.ExpandCategorical(
candidate.nid, candidate.split.findex, split_cats, candidate.split.dir == kLeftDir,
@@ -674,7 +657,7 @@ struct GPUHistMakerDevice {
// Update position is only run when child is valid, instead of right after apply
// split (as in approx tree method). Hence we have the finalise position call
// in GPU Hist.
this->UpdatePosition(candidate.nid, p_tree);
this->UpdatePosition(candidate, p_tree);
monitor.Stop("UpdatePosition");

monitor.Start("BuildHist");
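`UpdatePosition` now takes the whole `GPUExpandEntry`, so the split categories are read straight from the candidate's device bitset instead of a separately maintained `node_categories` vector, and `ApplySplit` always copies the host-side categories produced by the evaluator. The per-row decision this bitset feeds can be sketched as follows — a hypothetical helper, not the actual device lambda, and whether a set bit means "left" or "right" follows the library's own convention (a set bit is arbitrarily treated as "go left" here):

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch of per-row routing for a categorical split: the candidate's bitset
// (e.split.split_cats.Bits()) decides which child a row's category belongs
// to; categories unseen during training follow the default direction.
inline bool CategoryGoesLeft(std::uint32_t cat,
                             std::vector<std::uint32_t> const& split_bits,
                             bool default_left) {
  std::size_t word = cat / 32, bit = cat % 32;
  if (word >= split_bits.size()) {
    return default_left;  // category not covered by the bitset
  }
  return (split_bits[word] >> bit) & 1u;
}
```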