Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimization/buildhist/colwisebuildhist #8233

Merged
145 changes: 103 additions & 42 deletions src/common/hist_util.cc
Expand Up @@ -140,9 +140,9 @@ struct Prefetch {
constexpr size_t Prefetch::kNoPrefetchSize;

template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
void RowsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
Expand Down Expand Up @@ -204,75 +204,136 @@ void BuildHistKernel(const std::vector<GradientPair> &gpair,
}
}

template <typename BinIdxType, bool first_page, bool any_missing>
void ColsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return first_page ? ridx : (ridx - base_rowid);
};

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
auto hist_data = reinterpret_cast<double *>(hist.data());
const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t cid = 0; cid < n_columns; ++cid) {
const uint32_t offset = any_missing ? 0 : offsets[cid];
for (size_t i = 0; i < size; ++i) {
const size_t row_id = rid[i];
const size_t icol_start =
any_missing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
const size_t icol_end =
any_missing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

if (cid < icol_end - icol_start) {
const BinIdxType *gr_index_local = gradient_index + icol_start;
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
auto hist_local = hist_data + idx_bin;

const size_t idx_gh = two * row_id;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
}
}

template <bool do_prefetch, typename BinIdxType, bool first_page,
bool any_missing>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
if (read_by_column) {
ColsWiseBuildHistKernel<BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
} else {
RowsWiseBuildHistKernel<do_prefetch, BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
}
}

template <bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
GHistRow hist, bool read_by_column) {
auto first_page = gmat.base_rowid == 0;
if (first_page) {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
using BinIdxType = decltype(t);
if (first_page) {
BuildHistKernel<do_prefetch, BinIdxType, true, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
} else {
BuildHistKernel<do_prefetch, BinIdxType, false, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
}
} else {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
}
});
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) const {
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if need to work with all rows from bin-matrix (e.g. root node)
const bool contiguousBlock =
(row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistDispatch<false, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, row_indices, gmat, hist, read_by_column);
} else {
const RowSetCollection::Elem span1(row_indices.begin,
row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size,
row_indices.end);

BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist);
BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist, read_by_column);
// no prefetching to avoid loading extra memory
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist, read_by_column);
}
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow hist, bool force_read_by_column) const {
/* force_read_by_column is used for testing the columnwise building of histograms.
* default force_read_by_column = false
*/
constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
const bool hist_fit_to_l2 = kAdhocL2Size > 2*sizeof(float)*gmat.cut.Ptrs().back();
const bool read_by_column = !hist_fit_to_l2 && !any_missing;

BuildHistDispatch<any_missing>(gpair, row_indices, gmat, hist, read_by_column ||
force_read_by_column);
}

template void GHistBuilder::BuildHist<true>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist) const;
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;

template void GHistBuilder::BuildHist<false>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist) const;
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
} // namespace common
} // namespace xgboost
3 changes: 2 additions & 1 deletion src/common/hist_util.h
Expand Up @@ -623,7 +623,8 @@ class GHistBuilder {
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(const std::vector<GradientPair>& gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist) const;
const GHistIndexMatrix& gmat, GHistRow hist,
bool force_read_by_column = false) const;
uint32_t GetNumBins() const {
return nbins_;
}
Expand Down
20 changes: 13 additions & 7 deletions src/tree/hist/histogram.h
Expand Up @@ -59,7 +59,8 @@ class HistogramBuilder {
GHistIndexMatrix const &gidx,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
common::RowSetCollection const &row_set_collection,
const std::vector<GradientPair> &gpair_h) {
const std::vector<GradientPair> &gpair_h,
bool force_read_by_column) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
CHECK_GT(n_nodes, 0);

Expand All @@ -86,7 +87,8 @@ class HistogramBuilder {
elem.begin + end_of_row_set, nid);
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
if (rid_set.Size() != 0) {
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist);
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
force_read_by_column);
}
});
}
Expand All @@ -112,7 +114,8 @@ class HistogramBuilder {
RegTree *p_tree, common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
std::vector<GradientPair> const &gpair) {
std::vector<GradientPair> const &gpair,
bool force_read_by_column = false) {
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
if (page_id == 0) {
Expand All @@ -123,11 +126,13 @@ class HistogramBuilder {
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(page_id, space, gidx,
nodes_for_explicit_hist_build,
row_set_collection, gpair);
row_set_collection, gpair,
force_read_by_column);
} else {
this->BuildLocalHistograms<true>(page_id, space, gidx,
nodes_for_explicit_hist_build,
row_set_collection, gpair);
row_set_collection, gpair,
force_read_by_column);
}

CHECK_GE(n_batches_, 1);
Expand All @@ -148,7 +153,8 @@ class HistogramBuilder {
common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
std::vector<GradientPair> const &gpair) {
std::vector<GradientPair> const &gpair,
bool force_read_by_column = false) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
// create space of size (# rows in each node)
common::BlockedSpace2d space(
Expand All @@ -160,7 +166,7 @@ class HistogramBuilder {
256);
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection,
nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
gpair);
gpair, force_read_by_column);
}

void SyncHistogramDistributed(
Expand Down
9 changes: 6 additions & 3 deletions tests/cpp/tree/hist/test_evaluate_splits.cc
Expand Up @@ -12,7 +12,7 @@

namespace xgboost {
namespace tree {
void TestEvaluateSplits() {
void TestEvaluateSplits(bool force_read_by_column) {
int static constexpr kRows = 8, kCols = 16;
auto orig = omp_get_max_threads();
int32_t n_threads = std::min(omp_get_max_threads(), 4);
Expand Down Expand Up @@ -44,7 +44,7 @@ void TestEvaluateSplits() {
hist.AddHistRow(0);
hist.AllocateAllData();
hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
gmat, hist[0]);
gmat, hist[0], force_read_by_column);

// Compute total gradient for all data points
GradientPairPrecise total_gpair;
Expand Down Expand Up @@ -84,7 +84,10 @@ void TestEvaluateSplits() {
omp_set_num_threads(orig);
}

TEST(HistEvaluator, Evaluate) { TestEvaluateSplits(); }
TEST(HistEvaluator, Evaluate) {
TestEvaluateSplits(false);
TestEvaluateSplits(true);
}

TEST(HistEvaluator, Apply) {
RegTree tree;
Expand Down