Skip to content

Commit

Permalink
Optional by-column histogram build. (#8233)
Browse files Browse the repository at this point in the history
Co-authored-by: dmitry.razdoburdin <drazdobu@jfldaal005.jf.intel.com>
  • Loading branch information
razdoburdin and dmitry.razdoburdin committed Sep 21, 2022
1 parent b791446 commit eb7bbee
Show file tree
Hide file tree
Showing 5 changed files with 149 additions and 67 deletions.
145 changes: 103 additions & 42 deletions src/common/hist_util.cc
Expand Up @@ -140,9 +140,9 @@ struct Prefetch {
constexpr size_t Prefetch::kNoPrefetchSize;

template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing = true>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
void RowsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
Expand Down Expand Up @@ -204,75 +204,136 @@ void BuildHistKernel(const std::vector<GradientPair> &gpair,
}
}

template <typename BinIdxType, bool first_page, bool any_missing>
void ColsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
const size_t size = row_indices.Size();
const size_t *rid = row_indices.begin;
auto const *pgh = reinterpret_cast<const float *>(gpair.data());
const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();

auto const &row_ptr = gmat.row_ptr.data();
auto base_rowid = gmat.base_rowid;
const uint32_t *offsets = gmat.index.Offset();
auto get_row_ptr = [&](size_t ridx) {
return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](size_t ridx) {
return first_page ? ridx : (ridx - base_rowid);
};

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
auto hist_data = reinterpret_cast<double *>(hist.data());
const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains
// 2 FP values: gradient and hessian.
// So we need to multiply each row-index/bin-index by 2
// to work with gradient pairs as a singe row FP array
for (size_t cid = 0; cid < n_columns; ++cid) {
const uint32_t offset = any_missing ? 0 : offsets[cid];
for (size_t i = 0; i < size; ++i) {
const size_t row_id = rid[i];
const size_t icol_start =
any_missing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
const size_t icol_end =
any_missing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

if (cid < icol_end - icol_start) {
const BinIdxType *gr_index_local = gradient_index + icol_start;
const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
auto hist_local = hist_data + idx_bin;

const size_t idx_gh = two * row_id;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
}
}

template <bool do_prefetch, typename BinIdxType, bool first_page,
bool any_missing>
void BuildHistKernel(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
if (read_by_column) {
ColsWiseBuildHistKernel<BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
} else {
RowsWiseBuildHistKernel<do_prefetch, BinIdxType, first_page, any_missing>
(gpair, row_indices, gmat, hist);
}
}

template <bool do_prefetch, bool any_missing>
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) {
GHistRow hist, bool read_by_column) {
auto first_page = gmat.base_rowid == 0;
if (first_page) {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, true, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
using BinIdxType = decltype(t);
if (first_page) {
BuildHistKernel<do_prefetch, BinIdxType, true, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
} else {
BuildHistKernel<do_prefetch, BinIdxType, false, any_missing>
(gpair, row_indices, gmat, hist, read_by_column);
}
} else {
switch (gmat.index.GetBinTypeSize()) {
case kUint8BinsTypeSize:
BuildHistKernel<do_prefetch, uint8_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint16BinsTypeSize:
BuildHistKernel<do_prefetch, uint16_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
case kUint32BinsTypeSize:
BuildHistKernel<do_prefetch, uint32_t, false, any_missing>(gpair, row_indices, gmat, hist);
break;
default:
CHECK(false); // no default behavior
}
}
});
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist) const {
void BuildHistDispatch(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
GHistRow hist, bool read_by_column) {
const size_t nrows = row_indices.Size();
const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);

// if need to work with all rows from bin-matrix (e.g. root node)
const bool contiguousBlock =
(row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);

if (contiguousBlock) {
// contiguous memory access, built-in HW prefetching is enough
BuildHistDispatch<false, any_missing>(gpair, row_indices,
gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, row_indices, gmat, hist, read_by_column);
} else {
const RowSetCollection::Elem span1(row_indices.begin,
row_indices.end - no_prefetch_size);
const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size,
row_indices.end);

BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist);
BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist, read_by_column);
// no prefetching to avoid loading extra memory
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist);
BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist, read_by_column);
}
}

template <bool any_missing>
void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat,
GHistRow hist, bool force_read_by_column) const {
/* force_read_by_column is used for testing the columnwise building of histograms.
* default force_read_by_column = false
*/
constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
const bool hist_fit_to_l2 = kAdhocL2Size > 2*sizeof(float)*gmat.cut.Ptrs().back();
const bool read_by_column = !hist_fit_to_l2 && !any_missing;

BuildHistDispatch<any_missing>(gpair, row_indices, gmat, hist, read_by_column ||
force_read_by_column);
}

template void GHistBuilder::BuildHist<true>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist) const;
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;

template void GHistBuilder::BuildHist<false>(const std::vector<GradientPair> &gpair,
const RowSetCollection::Elem row_indices,
const GHistIndexMatrix &gmat, GHistRow hist) const;
const GHistIndexMatrix &gmat, GHistRow hist,
bool force_read_by_column) const;
} // namespace common
} // namespace xgboost
3 changes: 2 additions & 1 deletion src/common/hist_util.h
Expand Up @@ -623,7 +623,8 @@ class GHistBuilder {
// construct a histogram via histogram aggregation
template <bool any_missing>
void BuildHist(const std::vector<GradientPair>& gpair, const RowSetCollection::Elem row_indices,
const GHistIndexMatrix& gmat, GHistRow hist) const;
const GHistIndexMatrix& gmat, GHistRow hist,
bool force_read_by_column = false) const;
uint32_t GetNumBins() const {
return nbins_;
}
Expand Down
20 changes: 13 additions & 7 deletions src/tree/hist/histogram.h
Expand Up @@ -59,7 +59,8 @@ class HistogramBuilder {
GHistIndexMatrix const &gidx,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
common::RowSetCollection const &row_set_collection,
const std::vector<GradientPair> &gpair_h) {
const std::vector<GradientPair> &gpair_h,
bool force_read_by_column) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
CHECK_GT(n_nodes, 0);

Expand All @@ -86,7 +87,8 @@ class HistogramBuilder {
elem.begin + end_of_row_set, nid);
auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
if (rid_set.Size() != 0) {
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist);
builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
force_read_by_column);
}
});
}
Expand All @@ -112,7 +114,8 @@ class HistogramBuilder {
RegTree *p_tree, common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
std::vector<GradientPair> const &gpair) {
std::vector<GradientPair> const &gpair,
bool force_read_by_column = false) {
int starting_index = std::numeric_limits<int>::max();
int sync_count = 0;
if (page_id == 0) {
Expand All @@ -123,11 +126,13 @@ class HistogramBuilder {
if (gidx.IsDense()) {
this->BuildLocalHistograms<false>(page_id, space, gidx,
nodes_for_explicit_hist_build,
row_set_collection, gpair);
row_set_collection, gpair,
force_read_by_column);
} else {
this->BuildLocalHistograms<true>(page_id, space, gidx,
nodes_for_explicit_hist_build,
row_set_collection, gpair);
row_set_collection, gpair,
force_read_by_column);
}

CHECK_GE(n_batches_, 1);
Expand All @@ -148,7 +153,8 @@ class HistogramBuilder {
common::RowSetCollection const &row_set_collection,
std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
std::vector<GradientPair> const &gpair) {
std::vector<GradientPair> const &gpair,
bool force_read_by_column = false) {
const size_t n_nodes = nodes_for_explicit_hist_build.size();
// create space of size (# rows in each node)
common::BlockedSpace2d space(
Expand All @@ -160,7 +166,7 @@ class HistogramBuilder {
256);
this->BuildHist(page_id, space, gidx, p_tree, row_set_collection,
nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
gpair);
gpair, force_read_by_column);
}

void SyncHistogramDistributed(
Expand Down
9 changes: 6 additions & 3 deletions tests/cpp/tree/hist/test_evaluate_splits.cc
Expand Up @@ -12,7 +12,7 @@

namespace xgboost {
namespace tree {
void TestEvaluateSplits() {
void TestEvaluateSplits(bool force_read_by_column) {
int static constexpr kRows = 8, kCols = 16;
auto orig = omp_get_max_threads();
int32_t n_threads = std::min(omp_get_max_threads(), 4);
Expand Down Expand Up @@ -44,7 +44,7 @@ void TestEvaluateSplits() {
hist.AddHistRow(0);
hist.AllocateAllData();
hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
gmat, hist[0]);
gmat, hist[0], force_read_by_column);

// Compute total gradient for all data points
GradientPairPrecise total_gpair;
Expand Down Expand Up @@ -84,7 +84,10 @@ void TestEvaluateSplits() {
omp_set_num_threads(orig);
}

TEST(HistEvaluator, Evaluate) { TestEvaluateSplits(); }
TEST(HistEvaluator, Evaluate) {
TestEvaluateSplits(false);
TestEvaluateSplits(true);
}

TEST(HistEvaluator, Apply) {
RegTree tree;
Expand Down

0 comments on commit eb7bbee

Please sign in to comment.