Unify evaluation functions. (#6037)
trivialfis committed Aug 26, 2020
1 parent 80c8547 commit 2fcc4f2
Showing 29 changed files with 579 additions and 743 deletions.
1 change: 0 additions & 1 deletion amalgamation/xgboost-all0.cc
@@ -48,7 +48,6 @@

// trees
#include "../src/tree/param.cc"
#include "../src/tree/split_evaluator.cc"
#include "../src/tree/tree_model.cc"
#include "../src/tree/tree_updater.cc"
#include "../src/tree/updater_colmaker.cc"
2 changes: 1 addition & 1 deletion include/xgboost/base.h
@@ -242,7 +242,7 @@ class GradientPairInternal {

XGBOOST_DEVICE explicit GradientPairInternal(int value) {
*this = GradientPairInternal<T>(static_cast<float>(value),
static_cast<float>(value));
static_cast<float>(value));
}

friend std::ostream &operator<<(std::ostream &os,
4 changes: 2 additions & 2 deletions include/xgboost/data.h
@@ -247,7 +247,7 @@ class SparsePage {
/*! \brief the data of the segments */
HostDeviceVector<Entry> data;

size_t base_rowid{};
size_t base_rowid {0};

/*! \brief an instance of sparse vector in the batch */
using Inst = common::Span<Entry const>;
@@ -548,7 +548,7 @@ class DMatrix {
int nthread,
int max_bin);

virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;
/*! \brief page size 32 MB */
static const size_t kPageSize = 32UL << 20UL;

7 changes: 3 additions & 4 deletions include/xgboost/span.h
@@ -104,13 +104,12 @@ namespace common {
#if defined(__CUDA_ARCH__)
#define SPAN_LT(lhs, rhs) \
if (!((lhs) < (rhs))) { \
printf("%lu < %lu failed\n", static_cast<size_t>(lhs), \
static_cast<size_t>(rhs)); \
printf("[xgboost] Condition: %lu < %lu failed\n", \
static_cast<size_t>(lhs), static_cast<size_t>(rhs)); \
asm("trap;"); \
}
#else
#define SPAN_LT(lhs, rhs) \
SPAN_CHECK((lhs) < (rhs))
#define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))
#endif // defined(__CUDA_ARCH__)

namespace detail {
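
The SPAN_LT macro above guards Span indexing: in device code (__CUDA_ARCH__) it prints the failing comparison and issues a trap instruction, while on the host it simply forwards to SPAN_CHECK. A minimal host-only sketch of the same pattern, using hypothetical macro names rather than the real XGBoost ones:

#include <cstddef>
#include <cstdio>
#include <cstdlib>

// Host-side stand-in for SPAN_CHECK: print the failed condition and abort.
// Device code cannot abort(), which is why the real macro uses asm("trap;").
#define BOUNDS_CHECK(cond)                                   \
  do {                                                       \
    if (!(cond)) {                                           \
      std::fprintf(stderr, "Condition: %s failed\n", #cond); \
      std::abort();                                          \
    }                                                        \
  } while (0)

#define BOUNDS_LT(lhs, rhs) BOUNDS_CHECK((lhs) < (rhs))

int main() {
  std::size_t idx = 3, size = 8;
  BOUNDS_LT(idx, size);    // passes silently
  // BOUNDS_LT(size, idx); // would print the message and abort
  return 0;
}
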
2 changes: 0 additions & 2 deletions src/common/hist_util.h
@@ -659,8 +659,6 @@ class GHistBuilder {
/*! \brief number of all bins over all features */
uint32_t nbins_ { 0 };
};


} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_HIST_UTIL_H_
10 changes: 9 additions & 1 deletion src/common/observer.h
@@ -6,6 +6,7 @@
#define XGBOOST_COMMON_OBSERVER_H_

#include <iostream>
#include <algorithm>
#include <limits>
#include <string>
#include <vector>
@@ -62,6 +63,13 @@ class TrainingObserver {
auto const& tree = *p_tree;
this->Observe(tree);
}
template <typename T>
void Observe(common::Span<T> span, std::string name,
size_t n = std::numeric_limits<std::size_t>::max()) {
std::vector<T> copy(span.size());
std::copy(span.cbegin(), span.cend(), copy.begin());
this->Observe(copy, name, n);
}
/*\brief Observe data hosted by `std::vector'. */
template <typename T>
void Observe(std::vector<T> const& h_vec, std::string name,
@@ -71,7 +79,7 @@

for (size_t i = 0; i < h_vec.size(); ++i) {
OBSERVER_PRINT << h_vec[i] << ", ";
if (i % 8 == 0) {
if (i % 8 == 0 && i != 0) {
OBSERVER_PRINT << OBSERVER_NEWLINE;
}
if ((i + 1) == n) {
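
The new Observe overload added above copies a device-visible common::Span into a host std::vector and forwards to the existing vector overload, and the loop tweak (i % 8 == 0 && i != 0) stops the printer from emitting a newline right after the very first element. A standalone sketch of that copy-then-print pattern, assuming a raw pointer plus length in place of common::Span and std::cout in place of OBSERVER_PRINT:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

// Print up to n values, starting a new line periodically (but never
// immediately after the first value).
template <typename T>
void PrintValues(std::vector<T> const& values, std::string const& name,
                 std::size_t n = std::numeric_limits<std::size_t>::max()) {
  std::cout << "Content of " << name << ":\n";
  for (std::size_t i = 0; i < values.size(); ++i) {
    std::cout << values[i] << ", ";
    if (i % 8 == 0 && i != 0) {
      std::cout << '\n';
    }
    if ((i + 1) == n) {
      break;
    }
  }
  std::cout << '\n';
}

// Span-like overload: copy the viewed range to the host, then delegate.
template <typename T>
void PrintValues(T const* data, std::size_t size, std::string const& name) {
  std::vector<T> copy(size);
  std::copy(data, data + size, copy.begin());
  PrintValues(copy, name);
}

int main() {
  std::vector<float> grad(20, 0.5f);
  PrintValues(grad.data(), grad.size(), "gradients");
  return 0;
}
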
4 changes: 2 additions & 2 deletions src/common/row_set.h
@@ -24,13 +24,13 @@ class RowSetCollection {
struct Elem {
const size_t* begin{nullptr};
const size_t* end{nullptr};
int node_id{-1};
bst_node_t node_id{-1};
// id of node associated with this instance set; -1 means uninitialized
Elem()
= default;
Elem(const size_t* begin,
const size_t* end,
int node_id = -1)
bst_node_t node_id = -1)
: begin(begin), end(end), node_id(node_id) {}

inline size_t Size() const {
71 changes: 1 addition & 70 deletions src/tree/constraints.cuh
@@ -12,81 +12,12 @@
#include <vector>

#include "param.h"
#include "constraints.h"
#include "xgboost/span.h"
#include "../common/bitfield.h"
#include "../common/device_helpers.cuh"

namespace xgboost {

// This class implements monotonic constraints, L1, L2 regularization.
struct ValueConstraint {
double lower_bound;
double upper_bound;
XGBOOST_DEVICE ValueConstraint()
: lower_bound(-std::numeric_limits<double>::max()),
upper_bound(std::numeric_limits<double>::max()) {}
inline static void Init(tree::TrainParam *param, unsigned num_feature) {
param->monotone_constraints.resize(num_feature, 0);
}
template <typename ParamT, typename GpairT>
XGBOOST_DEVICE inline double CalcWeight(const ParamT &param, GpairT stats) const {
double w = xgboost::tree::CalcWeight(param, stats);
if (w < lower_bound) {
return lower_bound;
}
if (w > upper_bound) {
return upper_bound;
}
return w;
}

template <typename ParamT>
XGBOOST_DEVICE inline double CalcGain(const ParamT &param, tree::GradStats stats) const {
return tree::CalcGainGivenWeight<ParamT, float>(param, stats.sum_grad, stats.sum_hess,
CalcWeight(param, stats));
}

template <typename ParamT>
XGBOOST_DEVICE inline double CalcSplitGain(const ParamT &param, int constraint,
tree::GradStats left, tree::GradStats right) const {
const double negative_infinity = -std::numeric_limits<double>::infinity();
double wleft = CalcWeight(param, left);
double wright = CalcWeight(param, right);
double gain =
tree::CalcGainGivenWeight<ParamT, float>(param, left.sum_grad, left.sum_hess, wleft) +
tree::CalcGainGivenWeight<ParamT, float>(param, right.sum_grad, right.sum_hess, wright);
if (constraint == 0) {
return gain;
} else if (constraint > 0) {
return wleft <= wright ? gain : negative_infinity;
} else {
return wleft >= wright ? gain : negative_infinity;
}
}
template <typename GpairT>
void SetChild(const tree::TrainParam &param, bst_uint split_index,
GpairT left, GpairT right, ValueConstraint *cleft,
ValueConstraint *cright) {
int c = param.monotone_constraints.at(split_index);
*cleft = *this;
*cright = *this;
if (c == 0) {
return;
}
double wleft = CalcWeight(param, left);
double wright = CalcWeight(param, right);
double mid = (wleft + wright) / 2;
CHECK(!std::isnan(mid));
if (c < 0) {
cleft->lower_bound = mid;
cright->upper_bound = mid;
} else {
cleft->upper_bound = mid;
cright->lower_bound = mid;
}
}
};

// Feature interaction constraints built for GPU Hist updater.
struct FeatureInteractionConstraintDevice {
protected:
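
The deleted ValueConstraint struct above is the logic this commit folds into the shared TreeEvaluator::SplitEvaluator: a leaf weight is clamped to a [lower_bound, upper_bound] interval, a split whose child weights violate the requested monotone direction gets a gain of negative infinity, and after a split each child inherits a bound tightened to the midpoint of the two child weights. A simplified standalone C++ sketch of that behaviour, with invented type names and with the L1/L2 regularisation terms of the real weight and gain formulas omitted:

#include <algorithm>
#include <limits>

struct Stats { double sum_grad; double sum_hess; };

struct MonotoneConstraintSketch {
  double lower = -std::numeric_limits<double>::max();
  double upper = std::numeric_limits<double>::max();

  // Newton-step leaf weight (-G / H), clamped to the current bounds.
  double CalcWeight(Stats s) const {
    double w = s.sum_hess > 0.0 ? -s.sum_grad / s.sum_hess : 0.0;
    return std::min(std::max(w, lower), upper);
  }

  // Objective reduction of a leaf given its (possibly clamped) weight.
  static double GainGivenWeight(Stats s, double w) {
    return -(2.0 * s.sum_grad * w + s.sum_hess * w * w);
  }

  // Gain of a candidate split; -inf when the monotone direction is violated.
  // direction: 0 = unconstrained, >0 = increasing, <0 = decreasing.
  double CalcSplitGain(int direction, Stats left, Stats right) const {
    double wl = CalcWeight(left), wr = CalcWeight(right);
    double gain = GainGivenWeight(left, wl) + GainGivenWeight(right, wr);
    if (direction == 0) return gain;
    bool ok = direction > 0 ? wl <= wr : wl >= wr;
    return ok ? gain : -std::numeric_limits<double>::infinity();
  }

  // Propagate tightened bounds to the children once a split is applied.
  void SetChild(int direction, Stats left, Stats right,
                MonotoneConstraintSketch* cleft,
                MonotoneConstraintSketch* cright) const {
    *cleft = *this;
    *cright = *this;
    if (direction == 0) return;
    double mid = (CalcWeight(left) + CalcWeight(right)) / 2;
    if (direction < 0) { cleft->lower = mid; cright->upper = mid; }
    else               { cleft->upper = mid; cright->lower = mid; }
  }
};

int main() {
  MonotoneConstraintSketch root;
  Stats left{-4.0, 2.0}, right{-1.0, 2.0};  // child weights 2.0 and 0.5
  // An increasing constraint rejects this split: left weight > right weight.
  return root.CalcSplitGain(+1, left, right) ==
                 -std::numeric_limits<double>::infinity()
             ? 0
             : 1;
}
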
11 changes: 9 additions & 2 deletions src/tree/gpu_hist/driver.cuh
@@ -14,9 +14,16 @@ struct ExpandEntry {
int nid;
int depth;
DeviceSplitCandidate split;

float base_weight { std::numeric_limits<float>::quiet_NaN() };
float left_weight { std::numeric_limits<float>::quiet_NaN() };
float right_weight { std::numeric_limits<float>::quiet_NaN() };

ExpandEntry() = default;
XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split)
: nid(nid), depth(depth), split(std::move(split)) {}
XGBOOST_DEVICE ExpandEntry(int nid, int depth, DeviceSplitCandidate split,
float base, float left, float right)
: nid(nid), depth(depth), split(std::move(split)), base_weight{base},
left_weight{left}, right_weight{right} {}
bool IsValid(const TrainParam& param, int num_leaves) const {
if (split.loss_chg <= kRtEps) return false;
if (split.left_sum.GetHess() == 0 || split.right_sum.GetHess() == 0) {
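
The ExpandEntry change above adds the base, left and right leaf weights that the unified evaluator now computes per candidate, defaulting them to quiet NaN so an entry constructed without weights is distinguishable from one carrying real values. A minimal sketch of that default-to-NaN pattern (illustrative types, not the real DeviceSplitCandidate):

#include <cmath>
#include <limits>

struct CandidateSketch { float loss_chg = 0.0f; };

struct ExpandEntrySketch {
  int nid = 0;
  int depth = 0;
  CandidateSketch split;
  // NaN marks "weight not computed yet"; std::isnan detects it reliably,
  // whereas a default of 0.0f would collide with a legitimate weight.
  float base_weight  = std::numeric_limits<float>::quiet_NaN();
  float left_weight  = std::numeric_limits<float>::quiet_NaN();
  float right_weight = std::numeric_limits<float>::quiet_NaN();

  bool HasWeights() const {
    return !std::isnan(base_weight) && !std::isnan(left_weight) &&
           !std::isnan(right_weight);
  }
};

int main() {
  ExpandEntrySketch entry;
  return entry.HasWeights() ? 1 : 0;  // 0: weights are still unset
}
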
45 changes: 28 additions & 17 deletions src/tree/gpu_hist/evaluate_splits.cu
@@ -9,19 +9,20 @@ namespace tree {

// With constraints
template <typename GradientPairT>
XGBOOST_DEVICE float LossChangeMissing(const GradientPairT& scan,
const GradientPairT& missing,
const GradientPairT& parent_sum,
const GPUTrainingParam& param,
int constraint,
const ValueConstraint& value_constraint,
bool& missing_left_out) { // NOLINT
XGBOOST_DEVICE float
LossChangeMissing(const GradientPairT &scan, const GradientPairT &missing,
const GradientPairT &parent_sum,
const GPUTrainingParam &param,
bst_node_t nidx,
bst_feature_t fidx,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
bool &missing_left_out) { // NOLINT
float parent_gain = CalcGain(param, parent_sum);
float missing_left_gain = value_constraint.CalcSplitGain(
param, constraint, GradStats(scan + missing),
GradStats(parent_sum - (scan + missing)));
float missing_right_gain = value_constraint.CalcSplitGain(
param, constraint, GradStats(scan), GradStats(parent_sum - scan));
float missing_left_gain =
evaluator.CalcSplitGain(param, nidx, fidx, GradStats(scan + missing),
GradStats(parent_sum - (scan + missing)));
float missing_right_gain = evaluator.CalcSplitGain(
param, nidx, fidx, GradStats(scan), GradStats(parent_sum - scan));

if (missing_left_gain >= missing_right_gain) {
missing_left_out = true;
@@ -74,6 +75,7 @@ template <int BLOCK_THREADS, typename ReduceT, typename ScanT,
typename MaxReduceT, typename TempStorageT, typename GradientSumT>
__device__ void EvaluateFeature(
int fidx, EvaluateSplitInputs<GradientSumT> inputs,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
DeviceSplitCandidate* best_split, // shared memory storing best split
TempStorageT* temp_storage // temp memory for cub operations
) {
@@ -107,8 +109,10 @@ __device__ void EvaluateFeature(
float gain = null_gain;
if (thread_active) {
gain = LossChangeMissing(bin, missing, inputs.parent_sum, inputs.param,
inputs.monotonic_constraints[fidx],
inputs.value_constraint, missing_left);
inputs.nidx,
fidx,
evaluator,
missing_left);
}

__syncthreads();
@@ -148,6 +152,7 @@ template <int BLOCK_THREADS, typename GradientSumT>
__global__ void EvaluateSplitsKernel(
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
common::Span<DeviceSplitCandidate> out_candidates) {
// KeyValuePair here used as threadIdx.x -> gain_value
using ArgMaxT = cub::KeyValuePair<int, float>;
@@ -183,7 +188,7 @@ __global__ void EvaluateSplitsKernel(
: blockIdx.x - left.feature_set.size()];

EvaluateFeature<BLOCK_THREADS, SumReduceT, BlockScanT, MaxReduceT>(
fidx, inputs, &best_split, &temp_storage);
fidx, inputs, evaluator, &best_split, &temp_storage);

__syncthreads();

@@ -200,6 +205,7 @@ __device__ DeviceSplitCandidate operator+(const DeviceSplitCandidate& a,

template <typename GradientSumT>
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right) {
size_t combined_num_features =
@@ -209,7 +215,7 @@ void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
// One block for each feature
uint32_t constexpr kBlockThreads = 256;
dh::LaunchKernel {uint32_t(combined_num_features), kBlockThreads, 0}(
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right,
EvaluateSplitsKernel<kBlockThreads, GradientSumT>, left, right, evaluator,
dh::ToSpan(feature_best_splits));

// Reduce to get best candidate for left and right child over all features
@@ -240,23 +246,28 @@ void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,

template <typename GradientSumT>
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> input) {
EvaluateSplits(out_split, input, {});
EvaluateSplits(out_split, evaluator, input, {});
}

template void EvaluateSplits<GradientPair>(
common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPair> left,
EvaluateSplitInputs<GradientPair> right);
template void EvaluateSplits<GradientPairPrecise>(
common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPairPrecise> left,
EvaluateSplitInputs<GradientPairPrecise> right);
template void EvaluateSingleSplit<GradientPair>(
common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPair> input);
template void EvaluateSingleSplit<GradientPairPrecise>(
common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientPairPrecise> input);
} // namespace tree
} // namespace xgboost
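
In the rewritten LossChangeMissing above, the per-node, per-feature SplitEvaluator supplies the gain of routing the missing-value bucket to the left child (scan + missing) versus keeping it on the right (scan only), and the larger of the two decides the default direction. A standalone sketch of that decision, assuming a plain unregularised gain of G²/H per child rather than the evaluator's constrained gain:

#include <iostream>

struct GradSketch { double grad = 0.0, hess = 0.0; };

GradSketch operator+(GradSketch a, GradSketch b) { return {a.grad + b.grad, a.hess + b.hess}; }
GradSketch operator-(GradSketch a, GradSketch b) { return {a.grad - b.grad, a.hess - b.hess}; }

// Simplified split gain: G_l^2 / H_l + G_r^2 / H_r, regularisation omitted.
double SplitGain(GradSketch left, GradSketch right) {
  auto term = [](GradSketch s) { return s.hess > 0.0 ? s.grad * s.grad / s.hess : 0.0; };
  return term(left) + term(right);
}

// Try the missing-value bucket on both sides of the split and keep the better.
double LossChangeMissingSketch(GradSketch scan, GradSketch missing,
                               GradSketch parent_sum, bool* missing_left_out) {
  double left_gain  = SplitGain(scan + missing, parent_sum - (scan + missing));
  double right_gain = SplitGain(scan, parent_sum - scan);
  *missing_left_out = left_gain >= right_gain;
  return *missing_left_out ? left_gain : right_gain;
}

int main() {
  GradSketch scan{-2.0, 1.0}, missing{-1.0, 0.5}, parent{-1.0, 4.0};
  bool missing_left = false;
  std::cout << LossChangeMissingSketch(scan, missing, parent, &missing_left)
            << " missing_left=" << missing_left << '\n';
  return 0;
}
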
5 changes: 3 additions & 2 deletions src/tree/gpu_hist/evaluate_splits.cuh
@@ -5,6 +5,7 @@
#define EVALUATE_SPLITS_CUH_
#include <xgboost/span.h>
#include "../../data/ellpack_page.cuh"
#include "../split_evaluator.h"
#include "../constraints.cuh"
#include "../updater_gpu_common.cuh"

@@ -21,15 +22,15 @@ struct EvaluateSplitInputs {
common::Span<const float> feature_values;
common::Span<const float> min_fvalue;
common::Span<const GradientSumT> gradient_histogram;
ValueConstraint value_constraint;
common::Span<const int> monotonic_constraints;
};
template <typename GradientSumT>
void EvaluateSplits(common::Span<DeviceSplitCandidate> out_splits,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> left,
EvaluateSplitInputs<GradientSumT> right);
template <typename GradientSumT>
void EvaluateSingleSplit(common::Span<DeviceSplitCandidate> out_split,
TreeEvaluator::SplitEvaluator<GPUTrainingParam> evaluator,
EvaluateSplitInputs<GradientSumT> input);
} // namespace tree
} // namespace xgboost
2 changes: 1 addition & 1 deletion src/tree/gpu_hist/row_partitioner.cu
@@ -81,7 +81,7 @@ void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
auto counting = thrust::make_counting_iterator(0llu);
auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
counting, [=] __device__(size_t idx) {
return IndexFlagTuple{idx, position[idx] == left_nidx};
return IndexFlagTuple{idx, static_cast<size_t>(position[idx] == left_nidx)};
});
size_t temp_bytes = 0;
cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator,
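
The one-line change above makes the bool-to-size_t conversion explicit when building the IndexFlagTuple fed into cub::DeviceScan: each row is flagged if it moves to the left child, and the inclusive prefix sum of those flags yields stable destination slots for both children. A host-side sketch of the same prefix-sum partition, using std::inclusive_scan (C++17) in place of cub::DeviceScan::InclusiveScan:

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Stable partition via prefix sum: rows assigned to left_nidx are packed to
// the front in their original order, the remaining rows to the back.
std::vector<std::size_t> SortPositionSketch(std::vector<int> const& position,
                                            int left_nidx) {
  std::size_t n = position.size();
  std::vector<std::size_t> flags(n), scan(n), dest(n);
  for (std::size_t i = 0; i < n; ++i) {
    flags[i] = static_cast<std::size_t>(position[i] == left_nidx);  // explicit cast
  }
  std::inclusive_scan(flags.begin(), flags.end(), scan.begin());
  std::size_t left_count = n == 0 ? 0 : scan.back();
  for (std::size_t i = 0; i < n; ++i) {
    dest[i] = flags[i] ? scan[i] - 1                          // rank among left rows
                       : left_count + (i + 1 - scan[i]) - 1;  // rank among right rows
  }
  return dest;  // dest[i] is the output slot of row i
}

int main() {
  std::vector<int> position{1, 2, 1, 1, 2};
  for (std::size_t d : SortPositionSketch(position, /*left_nidx=*/1)) {
    std::cout << d << ' ';
  }
  std::cout << '\n';  // prints: 0 3 1 2 4
  return 0;
}
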
3 changes: 1 addition & 2 deletions src/tree/gpu_hist/row_partitioner.cuh
@@ -124,8 +124,7 @@ class RowPartitioner {
dh::safe_cuda(cudaMemcpyAsync(&left_count, d_left_count, sizeof(int64_t),
cudaMemcpyDeviceToHost, streams_[0]));

SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]
);
SortPositionAndCopy(segment, left_nidx, right_nidx, d_left_count, streams_[1]);

dh::safe_cuda(cudaStreamSynchronize(streams_[0]));
CHECK_LE(left_count, segment.Size());
