Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization/buildhist/colwisebuildhist #8233

Merged
131 changes: 91 additions & 40 deletions src/common/hist_util.cc
Expand Up @@ -140,9 +140,9 @@ struct Prefetch {
constexpr size_t Prefetch::kNoPrefetchSize;

template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
void RowsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
Expand Down Expand Up @@ -204,69 +204,120 @@ void BuildHistKernel(const std::vector<GradientPair> &gpair,
}
}

template <typename BinIdxType, bool first_page, bool any_missing>
void ColsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return first_page ? ridx : (ridx - base_rowid);
};

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
auto hist_data = reinterpret_cast<double *>(hist.data());
const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t cid = 0; cid < n_columns; ++cid) {
const uint32_t offset = any_missing ? 0 : offsets[cid];
for (size_t i = 0; i < size; ++i) {
const size_t row_id = rid[i];
const size_t icol_start =
any_missing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;

const BinIdxType *gr_index_local = gradient_index + icol_start;
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
auto hist_local = hist_data + idx_bin;

const size_t idx_gh = two * row_id;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
}

template <bool do_prefetch, typename BinIdxType, bool first_page,
bool any_missing>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
if (read_by_column) {
ColsWiseBuildHistKernel<BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
} else {
RowsWiseBuildHistKernel<do_prefetch, BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
}
}

template <bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
GHistRow hist, bool read_by_column) {
auto first_page = gmat.base_rowid == 0;
if (first_page) {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
} else {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
using BinIdxType = decltype(t);
if (first_page) {
BuildHistKernel<do_prefetch, BinIdxType, true, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
} else {
BuildHistKernel<do_prefetch, BinIdxType, false, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
}
}
});
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) const {
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if need to work with all rows from bin-matrix (e.g. root node)
const bool contiguousBlock =
(row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistDispatch<false, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, row_indices, gmat, hist, read_by_column);
} else {
const RowSetCollection::Elem span1(row_indices.begin,
row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size,
row_indices.end);

BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist);
BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist, read_by_column);
// no prefetching to avoid loading extra memory
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist, read_by_column);
}
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow hist) const {
constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
const bool hist_fit_to_l2 = kAdhocL2Size > 2*sizeof(float)*gmat.cut.Ptrs().back();
bool read_by_column = !hist_fit_to_l2 && !any_missing;

BuildHistDispatch<any_missing>(gpair, row_indices, gmat, hist, read_by_column);
}

template void GHistBuilder::BuildHist<true>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist) const;
Expand Down