Skip to content

Commit

Permalink
Simplify sparse and dense CPU hist kernels (#7029)
Browse files Browse the repository at this point in the history
* Simplify sparse and dense kernels
* Extract row partitioner.

Co-authored-by: Kirill Shvets <kirill.shvets@intel.com>
  • Loading branch information
ShvetsKS and Kirill Shvets committed Jun 11, 2021
1 parent 1faad82 commit 2567404
Show file tree
Hide file tree
Showing 10 changed files with 368 additions and 433 deletions.
46 changes: 43 additions & 3 deletions src/common/column_matrix.h
Expand Up @@ -30,6 +30,8 @@ enum ColumnType {
template <typename BinIdxType>
class Column {
public:
static constexpr int32_t kMissingId = -1;

Column(ColumnType type, common::Span<const BinIdxType> index, const uint32_t index_base)
: type_(type),
index_(index),
Expand Down Expand Up @@ -71,6 +73,30 @@ class SparseColumn: public Column<BinIdxType> {

/* Raw pointer to the first element of row_ind_, the per-entry row index array. */
const size_t* GetRowData() const { return row_ind_.data(); }

/* Look up the bin index stored for row `rid`, using `*state` as a forward-only
 * cursor into this column's stored entries.
 *
 * rid   - row id to look up.
 * state - in/out cursor; advanced past every entry whose row index is below
 *         `rid` and left pointing at the first entry that is not.
 *
 * Returns GetGlobalBinIdx(cursor) when the entry at the cursor belongs to
 * `rid`, otherwise kMissingId. The cursor never moves backwards, so only the
 * entry it ends on is compared against `rid`.
 * NOTE(review): presumably row indices are stored in ascending order and
 * callers query rows in non-decreasing order - confirm against callers. */
int32_t GetBinIdx(size_t rid, size_t* state) const {
  const size_t n_entries = this->Size();
  size_t& cursor = *state;

  // Skip the entries that belong to rows preceding `rid`.
  while (cursor < n_entries && GetRowIdx(cursor) < rid) {
    ++cursor;
  }

  // Either the column is exhausted or the cursor sits on a row id >= rid.
  const bool found = (cursor < n_entries) && (GetRowIdx(cursor) == rid);
  if (!found) {
    return this->kMissingId;
  }
  return this->GetGlobalBinIdx(cursor);
}

/* Compute the starting cursor for GetBinIdx when iteration begins at
 * `first_row_id`.
 *
 * Returns the position of the first stored entry whose row index is not less
 * than `first_row_id`, or Size() when there is no such entry (every row from
 * `first_row_id` onwards is missing in this column).
 * NOTE(review): std::lower_bound requires the row indices to be sorted in
 * ascending order - confirm that the column builder guarantees this. */
size_t GetInitialState(const size_t first_row_id) const {
  const size_t* const rows_begin = GetRowData();
  const size_t* const rows_end = rows_begin + this->Size();
  const size_t* const first_hit = std::lower_bound(rows_begin, rows_end, first_row_id);
  return first_hit - rows_begin;
}

/* Row index of the idx-th stored entry; no bounds check is performed. */
size_t GetRowIdx(size_t idx) const {
  const size_t* const rows = row_ind_.data();
  return rows[idx];
}
Expand All @@ -80,7 +106,7 @@ class SparseColumn: public Column<BinIdxType> {
common::Span<const size_t> row_ind_;
};

template <typename BinIdxType>
template <typename BinIdxType, bool any_missing>
class DenseColumn: public Column<BinIdxType> {
public:
DenseColumn(ColumnType type, common::Span<const BinIdxType> index,
Expand All @@ -90,6 +116,19 @@ class DenseColumn: public Column<BinIdxType> {
missing_flags_(missing_flags),
feature_offset_(feature_offset) {}
/* Value of the missing flag stored at position feature_offset_ + idx of missing_flags_. */
bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; }

/* Bin index for row `idx` of this dense column.
 *
 * idx   - row position inside the column.
 * state - unused here; kept so the signature matches the sparse column's
 *         GetBinIdx.
 *
 * When the class is instantiated with any_missing == true and the missing
 * flag for `idx` is set, kMissingId is returned; in every other case the
 * result is GetGlobalBinIdx(idx). With any_missing == false the missing flag
 * is never consulted. */
int32_t GetBinIdx(size_t idx, size_t* state) const {
  // Short-circuit keeps IsMissing() from being evaluated when any_missing is false.
  const bool is_absent = any_missing && IsMissing(idx);
  return is_absent ? this->kMissingId : this->GetGlobalBinIdx(idx);
}

/* Initial cursor value for GetBinIdx. The dense GetBinIdx ignores its state
 * argument, so the result is always 0 and first_row_id is not used. */
size_t GetInitialState(const size_t first_row_id) const {
return 0;
}

private:
/* flags for missing values in dense columns */
const std::vector<bool>& missing_flags_;
Expand Down Expand Up @@ -202,7 +241,7 @@ class ColumnMatrix {

/* Fetch an individual column. This code should be used with a type switch
to determine the type of the bin ids */
template <typename BinIdxType>
template <typename BinIdxType, bool any_missing>
std::unique_ptr<const Column<BinIdxType> > GetColumn(unsigned fid) const {
CHECK_EQ(sizeof(BinIdxType), bins_type_size_);

Expand All @@ -213,7 +252,8 @@ class ColumnMatrix {
column_size };
std::unique_ptr<const Column<BinIdxType> > res;
if (type_[fid] == ColumnType::kDenseColumn) {
res.reset(new DenseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
CHECK_EQ(any_missing, any_missing_);
res.reset(new DenseColumn<BinIdxType, any_missing>(type_[fid], bin_index, index_base_[fid],
missing_flags_, feature_offset));
} else {
res.reset(new SparseColumn<BinIdxType>(type_[fid], bin_index, index_base_[fid],
Expand Down
131 changes: 46 additions & 85 deletions src/common/hist_util.cc
Expand Up @@ -287,130 +287,83 @@ struct Prefetch {
constexpr size_t Prefetch::kNoPrefetchSize;


template<typename FPType, bool do_prefetch, typename BinIdxType>
void BuildHistDenseKernel(const std::vector<GradientPair>& gpair,
template<typename FPType, bool do_prefetch, typename BinIdxType, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
const size_t n_features,
GHistRow<FPType> hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const BinIdxType* gradient_index = gmat.index.data<BinIdxType>();
const size_t* row_ptr = gmat.row_ptr.data();
const uint32_t* offsets = gmat.index.Offset();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < size; ++i) {
const size_t icol_start = rid[i] * n_features;
const size_t icol_start = any_missing ? row_ptr[rid[i]] : rid[i] * n_features;
const size_t icol_end = any_missing ? row_ptr[rid[i]+1] : icol_start + n_features;
const size_t row_size = icol_end - icol_start;
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features;
const size_t icol_start_prftch = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]] :
rid[i + Prefetch::kPrefetchOffset] * n_features;
const size_t icol_end_prefect = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]+1] :
icol_start_prftch + n_features;

PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features;
j += Prefetch::GetPrefetchStep<BinIdxType>()) {
for (size_t j = icol_start_prftch; j < icol_end_prefect;
j+=Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
}
}
const BinIdxType* gr_index_local = gradient_index + icol_start;
for (size_t j = 0; j < n_features; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
offsets[j]);

hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}

template<typename FPType, bool do_prefetch>
void BuildHistSparseKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<FPType> hist) {
const size_t size = row_indices.Size();
const size_t* rid = row_indices.begin;
const float* pgh = reinterpret_cast<const float*>(gpair.data());
const uint32_t* gradient_index = gmat.index.data<uint32_t>();
const size_t* row_ptr = gmat.row_ptr.data();
FPType* hist_data = reinterpret_cast<FPType*>(hist.data());
const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a single row FP array

for (size_t i = 0; i < size; ++i) {
const size_t icol_start = row_ptr[rid[i]];
const size_t icol_end = row_ptr[rid[i]+1];
const size_t idx_gh = two * rid[i];
for (size_t j = 0; j < row_size; ++j) {
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) + (
any_missing ? 0 : offsets[j]));

if (do_prefetch) {
const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]];
const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1];

PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prftch; j < icol_end_prefect;
j+=Prefetch::GetPrefetchStep<uint32_t>()) {
PREFETCH_READ_T0(gradient_index + j);
}
}
for (size_t j = icol_start; j < icol_end; ++j) {
const uint32_t idx_bin = two * gradient_index[j];
hist_data[idx_bin] += pgh[idx_gh];
hist_data[idx_bin+1] += pgh[idx_gh+1];
}
}
}


template<typename FPType, bool do_prefetch, typename BinIdxType>
void BuildHistDispatchKernel(const std::vector<GradientPair>& gpair,
template<typename FPType, bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow<FPType> hist, bool isDense) {
if (isDense) {
const size_t* row_ptr = gmat.row_ptr.data();
const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]];
BuildHistDenseKernel<FPType, do_prefetch, BinIdxType>(gpair, row_indices,
gmat, n_features, hist);
} else {
BuildHistSparseKernel<FPType, do_prefetch>(gpair, row_indices,
gmat, hist);
}
}

template<typename FPType, bool do_prefetch>
void BuildHistKernel(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, const bool isDense, GHistRow<FPType> hist) {
const bool is_dense = row_indices.Size() && isDense;
const GHistIndexMatrix& gmat, GHistRow<FPType> hist) {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistDispatchKernel<FPType, do_prefetch, uint8_t>(gpair, row_indices,
gmat, hist, is_dense);
BuildHistKernel<FPType, do_prefetch, uint8_t, any_missing>(gpair, row_indices,
gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistDispatchKernel<FPType, do_prefetch, uint16_t>(gpair, row_indices,
gmat, hist, is_dense);
BuildHistKernel<FPType, do_prefetch, uint16_t, any_missing>(gpair, row_indices,
gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistDispatchKernel<FPType, do_prefetch, uint32_t>(gpair, row_indices,
gmat, hist, is_dense);
BuildHistKernel<FPType, do_prefetch, uint32_t, any_missing>(gpair, row_indices,
gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
}

template <typename GradientSumT>
template <bool any_missing>
void GHistBuilder<GradientSumT>::BuildHist(
const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRowT hist, bool isDense) {
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRowT hist) {
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

Expand All @@ -419,28 +372,36 @@ void GHistBuilder<GradientSumT>::BuildHist(

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistKernel<GradientSumT, false>(gpair, row_indices, gmat, isDense, hist);
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, row_indices, gmat, hist);
} else {
const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);

BuildHistKernel<GradientSumT, true>(gpair, span1, gmat, isDense, hist);
BuildHistDispatch<GradientSumT, true, any_missing>(gpair, span1, gmat, hist);
// no prefetching to avoid loading extra memory
BuildHistKernel<GradientSumT, false>(gpair, span2, gmat, isDense, hist);
BuildHistDispatch<GradientSumT, false, any_missing>(gpair, span2, gmat, hist);
}
}
template
void GHistBuilder<float>::BuildHist(const std::vector<GradientPair>& gpair,
void GHistBuilder<float>::BuildHist<true>(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<float> hist);
template
void GHistBuilder<float>::BuildHist<false>(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<float> hist);
template
void GHistBuilder<double>::BuildHist<true>(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<float> hist,
bool isDense);
GHistRow<double> hist);
template
void GHistBuilder<double>::BuildHist(const std::vector<GradientPair>& gpair,
void GHistBuilder<double>::BuildHist<false>(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRow<double> hist,
bool isDense);
GHistRow<double> hist);

template<typename GradientSumT>
void GHistBuilder<GradientSumT>::SubtractionTrick(GHistRowT self,
Expand Down
4 changes: 2 additions & 2 deletions src/common/hist_util.h
Expand Up @@ -627,11 +627,11 @@ class GHistBuilder {
GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {}

// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(const std::vector<GradientPair>& gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat,
GHistRowT hist,
bool isDense);
GHistRowT hist);
// construct a histogram via subtraction trick
void SubtractionTrick(GHistRowT self,
GHistRowT sibling,
Expand Down

0 comments on commit 2567404

Please sign in to comment.