diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index e55477c95de2..42e3ff9ce202 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -140,9 +140,9 @@ struct Prefetch {
 constexpr size_t Prefetch::kNoPrefetchSize;
 
 template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing>
-void BuildHistKernel(const std::vector<GradientPair> &gpair,
-                     const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
-                     GHistRow hist) {
+void RowsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
+                             const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
+                             GHistRow hist) {
   const size_t size = row_indices.Size();
   const size_t *rid = row_indices.begin;
   auto const *pgh = reinterpret_cast<const float *>(gpair.data());
@@ -204,75 +204,136 @@ void BuildHistKernel(const std::vector<GradientPair> &gpair,
   }
 }
 
+template <typename BinIdxType, bool first_page, bool any_missing>
+void ColsWiseBuildHistKernel(const std::vector<GradientPair> &gpair,
+                             const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
+                             GHistRow hist) {
+  const size_t size = row_indices.Size();
+  const size_t *rid = row_indices.begin;
+  auto const *pgh = reinterpret_cast<const float *>(gpair.data());
+  const BinIdxType *gradient_index = gmat.index.data<BinIdxType>();
+
+  auto const &row_ptr = gmat.row_ptr.data();
+  auto base_rowid = gmat.base_rowid;
+  const uint32_t *offsets = gmat.index.Offset();
+  auto get_row_ptr = [&](size_t ridx) {
+    return first_page ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
+  };
+  auto get_rid = [&](size_t ridx) {
+    return first_page ? ridx : (ridx - base_rowid);
+  };
+
+  const size_t n_features = gmat.cut.Ptrs().size() - 1;
+  const size_t n_columns = n_features;
+  auto hist_data = reinterpret_cast<double *>(hist.data());
+  const uint32_t two{2};  // Each element from 'gpair' and 'hist' contains
+                          // 2 FP values: gradient and hessian.
+                          // So we need to multiply each row-index/bin-index by 2
+                          // to work with gradient pairs as a single row FP array.
+  for (size_t cid = 0; cid < n_columns; ++cid) {
+    const uint32_t offset = any_missing ? 0 : offsets[cid];
+    for (size_t i = 0; i < size; ++i) {
+      const size_t row_id = rid[i];
+      const size_t icol_start =
+          any_missing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
+      const size_t icol_end =
+          any_missing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;
+
+      if (cid < icol_end - icol_start) {
+        const BinIdxType *gr_index_local = gradient_index + icol_start;
+        const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[cid]) + offset);
+        auto hist_local = hist_data + idx_bin;
+
+        const size_t idx_gh = two * row_id;
+        // The trick with the pgh_t buffer helps the compiler to generate a faster binary.
+        const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
+        *(hist_local)     += pgh_t[0];
+        *(hist_local + 1) += pgh_t[1];
+      }
+    }
+  }
+}
+
+template <bool do_prefetch, typename BinIdxType, bool first_page, bool any_missing>
+void BuildHistKernel(const std::vector<GradientPair> &gpair,
+                     const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
+                     GHistRow hist, bool read_by_column) {
+  if (read_by_column) {
+    ColsWiseBuildHistKernel<BinIdxType, first_page, any_missing>
+        (gpair, row_indices, gmat, hist);
+  } else {
+    RowsWiseBuildHistKernel<do_prefetch, BinIdxType, first_page, any_missing>
+        (gpair, row_indices, gmat, hist);
+  }
+}
+
 template <bool do_prefetch, bool any_missing>
 void BuildHistDispatch(const std::vector<GradientPair> &gpair,
                        const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
-                       GHistRow hist) {
+                       GHistRow hist, bool read_by_column) {
   auto first_page = gmat.base_rowid == 0;
-  if (first_page) {
-    switch (gmat.index.GetBinTypeSize()) {
-      case kUint8BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint8_t, true, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      case kUint16BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint16_t, true, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      case kUint32BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint32_t, true, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      default:
-        CHECK(false);  // no default behavior
+  DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
+    using BinIdxType = decltype(t);
+    if (first_page) {
+      BuildHistKernel<do_prefetch, BinIdxType, true, any_missing>
+          (gpair, row_indices, gmat, hist, read_by_column);
+    } else {
+      BuildHistKernel<do_prefetch, BinIdxType, false, any_missing>
+          (gpair, row_indices, gmat, hist, read_by_column);
     }
-  } else {
-    switch (gmat.index.GetBinTypeSize()) {
-      case kUint8BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint8_t, false, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      case kUint16BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint16_t, false, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      case kUint32BinsTypeSize:
-        BuildHistKernel<do_prefetch, uint32_t, false, any_missing>(gpair, row_indices, gmat, hist);
-        break;
-      default:
-        CHECK(false);  // no default behavior
-    }
-  }
+  });
 }
 
 template <bool any_missing>
-void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
-                             const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
-                             GHistRow hist) const {
+void BuildHistDispatch(const std::vector<GradientPair> &gpair,
+                       const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat,
+                       GHistRow hist, bool read_by_column) {
   const size_t nrows = row_indices.Size();
   const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows);
-  // if need to work with all rows from bin-matrix (e.g. root node)
   const bool contiguousBlock = (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1);
 
   if (contiguousBlock) {
     // contiguous memory access, built-in HW prefetching is enough
-    BuildHistDispatch<false, any_missing>(gpair, row_indices,
-                                          gmat, hist);
+    BuildHistDispatch<false, any_missing>(gpair, row_indices, gmat, hist, read_by_column);
   } else {
     const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size);
     const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end);
 
-    BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist);
+    BuildHistDispatch<true, any_missing>(gpair, span1, gmat, hist, read_by_column);
     // no prefetching to avoid loading extra memory
-    BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist);
+    BuildHistDispatch<false, any_missing>(gpair, span2, gmat, hist, read_by_column);
   }
 }
 
+template <bool any_missing>
+void GHistBuilder::BuildHist(const std::vector<GradientPair> &gpair,
+                             const RowSetCollection::Elem row_indices,
+                             const GHistIndexMatrix &gmat,
+                             GHistRow hist, bool force_read_by_column) const {
+  /* force_read_by_column is used for testing the column-wise building of histograms.
+   * The default is force_read_by_column = false.
+   */
+  constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
+  const bool hist_fit_to_l2 = kAdhocL2Size > 2 * sizeof(float) * gmat.cut.Ptrs().back();
+  const bool read_by_column = !hist_fit_to_l2 && !any_missing;
+
+  BuildHistDispatch<any_missing>(gpair, row_indices, gmat, hist, read_by_column ||
+                                 force_read_by_column);
+}
+
 template void GHistBuilder::BuildHist<true>(const std::vector<GradientPair> &gpair,
                                             const RowSetCollection::Elem row_indices,
-                                            const GHistIndexMatrix &gmat, GHistRow hist) const;
+                                            const GHistIndexMatrix &gmat, GHistRow hist,
+                                            bool force_read_by_column) const;
 template void GHistBuilder::BuildHist<false>(const std::vector<GradientPair> &gpair,
                                              const RowSetCollection::Elem row_indices,
-                                             const GHistIndexMatrix &gmat, GHistRow hist) const;
+                                             const GHistIndexMatrix &gmat, GHistRow hist,
+                                             bool force_read_by_column) const;
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/hist_util.h b/src/common/hist_util.h
index 9bcc78ba435e..06e1dc22d946 100644
--- a/src/common/hist_util.h
+++ b/src/common/hist_util.h
@@ -623,7 +623,8 @@ class GHistBuilder {
   // construct a histogram via histogram aggregation
   template <bool any_missing>
   void BuildHist(const std::vector<GradientPair>& gpair, const RowSetCollection::Elem row_indices,
-                 const GHistIndexMatrix& gmat, GHistRow hist) const;
+                 const GHistIndexMatrix& gmat, GHistRow hist,
+                 bool force_read_by_column = false) const;
 
   uint32_t GetNumBins() const { return nbins_; }
 
diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h
index 70fad741be73..16f9f5e61c93 100644
--- a/src/tree/hist/histogram.h
+++ b/src/tree/hist/histogram.h
@@ -59,7 +59,8 @@ class HistogramBuilder {
                             GHistIndexMatrix const &gidx,
                             std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                             common::RowSetCollection const &row_set_collection,
-                            const std::vector<GradientPair> &gpair_h) {
+                            const std::vector<GradientPair> &gpair_h,
+                            bool force_read_by_column) {
     const size_t n_nodes = nodes_for_explicit_hist_build.size();
     CHECK_GT(n_nodes, 0);
 
@@ -86,7 +87,8 @@ class HistogramBuilder {
                                               elem.begin + end_of_row_set, nid);
       auto hist = buffer_.GetInitializedHist(tid, nid_in_set);
       if (rid_set.Size() != 0) {
-        builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist);
+        builder_.template BuildHist<any_missing>(gpair_h, rid_set, gidx, hist,
+                                                 force_read_by_column);
       }
     });
   }
@@ -112,7 +114,8 @@ class HistogramBuilder {
                  RegTree *p_tree, common::RowSetCollection const &row_set_collection,
                  std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                  std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                 std::vector<GradientPair> const &gpair) {
+                 std::vector<GradientPair> const &gpair,
+                 bool force_read_by_column = false) {
     int starting_index = std::numeric_limits<int>::max();
     int sync_count = 0;
     if (page_id == 0) {
@@ -123,11 +126,13 @@ class HistogramBuilder {
     if (gidx.IsDense()) {
       this->BuildLocalHistograms<false>(page_id, space, gidx,
                                         nodes_for_explicit_hist_build,
-                                        row_set_collection, gpair);
+                                        row_set_collection, gpair,
+                                        force_read_by_column);
     } else {
       this->BuildLocalHistograms<true>(page_id, space, gidx,
                                        nodes_for_explicit_hist_build,
-                                       row_set_collection, gpair);
+                                       row_set_collection, gpair,
+                                       force_read_by_column);
     }
 
     CHECK_GE(n_batches_, 1);
@@ -148,7 +153,8 @@ class HistogramBuilder {
                  common::RowSetCollection const &row_set_collection,
                  std::vector<ExpandEntry> const &nodes_for_explicit_hist_build,
                  std::vector<ExpandEntry> const &nodes_for_subtraction_trick,
-                 std::vector<GradientPair> const &gpair) {
+                 std::vector<GradientPair> const &gpair,
+                 bool force_read_by_column = false) {
     const size_t n_nodes = nodes_for_explicit_hist_build.size();
     // create space of size (# rows in each node)
     common::BlockedSpace2d space(
@@ -160,7 +166,7 @@ class HistogramBuilder {
         256);
     this->BuildHist(page_id, space, gidx, p_tree, row_set_collection,
                     nodes_for_explicit_hist_build, nodes_for_subtraction_trick,
-                    gpair);
+                    gpair, force_read_by_column);
   }
 
   void SyncHistogramDistributed(
diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc
index 37c8597cab53..7e1d285e7af2 100644
--- a/tests/cpp/tree/hist/test_evaluate_splits.cc
+++ b/tests/cpp/tree/hist/test_evaluate_splits.cc
@@ -12,7 +12,7 @@
 namespace xgboost {
 namespace tree {
 
-void TestEvaluateSplits() {
+void TestEvaluateSplits(bool force_read_by_column) {
   int static constexpr kRows = 8, kCols = 16;
   auto orig = omp_get_max_threads();
   int32_t n_threads = std::min(omp_get_max_threads(), 4);
@@ -44,7 +44,7 @@ void TestEvaluateSplits() {
   hist.AddHistRow(0);
   hist.AllocateAllData();
   hist_builder.template BuildHist<false>(row_gpairs, row_set_collection[0],
-                                         gmat, hist[0]);
+                                         gmat, hist[0], force_read_by_column);
 
   // Compute total gradient for all data points
   GradientPairPrecise total_gpair;
@@ -84,7 +84,10 @@ void TestEvaluateSplits() {
   omp_set_num_threads(orig);
 }
 
-TEST(HistEvaluator, Evaluate) { TestEvaluateSplits(); }
+TEST(HistEvaluator, Evaluate) {
+  TestEvaluateSplits(false);
+  TestEvaluateSplits(true);
+}
 
 TEST(HistEvaluator, Apply) {
   RegTree tree;
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index 3c0728c38a37..2de71f6b369b 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -225,7 +225,7 @@ TEST(CPUHistogram, SyncHist) {
   TestSyncHist(false);
 }
 
-void TestBuildHistogram(bool is_distributed) {
+void TestBuildHistogram(bool is_distributed, bool force_read_by_column) {
   size_t constexpr kNRows = 8, kNCols = 16;
   int32_t constexpr kMaxBins = 4;
   auto p_fmat =
@@ -256,7 +256,7 @@ void TestBuildHistogram(bool is_distributed) {
   nodes_for_explicit_hist_build.push_back(node);
   for (auto const &gidx : p_fmat->GetBatches<GHistIndexMatrix>({kMaxBins, 0.5})) {
     histogram.BuildHist(0, gidx, &tree, row_set_collection,
-                        nodes_for_explicit_hist_build, {}, gpair);
+                        nodes_for_explicit_hist_build, {}, gpair, force_read_by_column);
   }
 
   // Check if number of histogram bins is correct
@@ -283,12 +283,15 @@ void TestBuildHistogram(bool is_distributed) {
 }
 
 TEST(CPUHistogram, BuildHist) {
-  TestBuildHistogram(true);
-  TestBuildHistogram(false);
+  TestBuildHistogram(true, false);
+  TestBuildHistogram(false, false);
+  TestBuildHistogram(true, true);
+  TestBuildHistogram(false, true);
+
 }
 
 namespace {
-void TestHistogramCategorical(size_t n_categories) {
+void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) {
   size_t constexpr kRows = 340;
   int32_t constexpr kBins = 256;
   auto x = GenerateRandomCategoricalSingleColumn(kRows, n_categories);
@@ -318,7 +321,8 @@ void TestHistogramCategorical(size_t n_categories) {
     auto total_bins = gidx.cut.TotalBins();
     cat_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
     cat_hist.BuildHist(0, gidx, &tree, row_set_collection,
-                       nodes_for_explicit_hist_build, {}, gpair.HostVector());
+                       nodes_for_explicit_hist_build, {}, gpair.HostVector(),
+                       force_read_by_column);
   }
 
   /**
@@ -331,7 +335,8 @@ void TestHistogramCategorical(size_t n_categories) {
     auto total_bins = gidx.cut.TotalBins();
     onehot_hist.Reset(total_bins, {kBins, 0.5}, omp_get_max_threads(), 1, false);
     onehot_hist.BuildHist(0, gidx, &tree, row_set_collection, nodes_for_explicit_hist_build, {},
-                          gpair.HostVector());
+                          gpair.HostVector(),
+                          force_read_by_column);
   }
 
   auto cat = cat_hist.Histogram()[0];
@@ -342,11 +347,14 @@ TEST(CPUHistogram, Categorical) {
   for (size_t n_categories = 2; n_categories < 8; ++n_categories) {
-    TestHistogramCategorical(n_categories);
+    TestHistogramCategorical(n_categories, false);
+  }
+  for (size_t n_categories = 2; n_categories < 8; ++n_categories) {
+    TestHistogramCategorical(n_categories, true);
   }
 }
 
 namespace {
-void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
+void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx, bool force_read_by_column) {
   size_t constexpr kEntries = 1 << 16;
   auto m = CreateSparsePageDMatrix(kEntries, "cache");
 
@@ -394,7 +402,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
   size_t page_idx{0};
   for (auto const &page : m->GetBatches<GHistIndexMatrix>(batch_param)) {
     multi_build.BuildHist(page_idx, space, page, &tree, rows_set.at(page_idx), nodes, {},
-                          h_gpair);
+                          h_gpair, force_read_by_column);
     ++page_idx;
   }
   ASSERT_EQ(page_idx, 2);
@@ -421,7 +429,7 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
                      false, hess);
     GHistIndexMatrix gmat(concat, {}, cut, batch_param.max_bin, false,
                           std::numeric_limits<double>::quiet_NaN(), common::OmpGetNumThreads(0));
-    single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair);
+    single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair, force_read_by_column);
     single_page = single_build.Histogram()[0];
   }
 
@@ -434,12 +442,15 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
 
 TEST(CPUHistogram, ExternalMemory) {
   int32_t constexpr kBins = 256;
 
-  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true);
+  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, false);
+  TestHistogramExternalMemory(BatchParam{kBins, common::Span<float>{}, false}, true, true);
   float sparse_thresh{0.5};
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false);
+  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
   sparse_thresh = std::numeric_limits<float>::quiet_NaN();
-  TestHistogramExternalMemory({kBins, sparse_thresh}, false);
+  TestHistogramExternalMemory({kBins, sparse_thresh}, false, false);
+  TestHistogramExternalMemory({kBins, sparse_thresh}, false, true);
 }
 }  // namespace tree
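Note on the approach taken by this patch: RowsWiseBuildHistKernel scatters each row's gradient pair into bins belonging to every feature, so the entire histogram has to stay cache-resident, while ColsWiseBuildHistKernel fixes one feature at a time, so writes land only in that feature's slice of bins. GHistBuilder::BuildHist picks the column-wise path when the histogram would exceed an ad hoc ~0.8 MB share of L2 (kAdhocL2Size) and the page has no missing values, and force_read_by_column lets the tests exercise that path directly. The standalone sketch below illustrates the access-pattern difference and the heuristic outside of XGBoost; the names ToyGradientPair, BuildHistRowWise, BuildHistColWise, and ReadByColumn are simplified, hypothetical stand-ins (a dense row-major bin matrix with no missing values), not XGBoost APIs.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Simplified stand-ins: a dense row-major matrix of per-feature bin indices,
// one (grad, hess) pair per row, and a flat histogram holding two doubles
// (gradient sum, hessian sum) per global bin.
struct ToyGradientPair {
  float grad;
  float hess;
};

// Row-wise traversal: every row touches bins of every feature, so writes are
// spread across the whole histogram.
void BuildHistRowWise(const std::vector<std::uint8_t>& bins, std::size_t n_features,
                      const std::vector<std::uint32_t>& bin_offsets,  // first global bin per feature
                      const std::vector<ToyGradientPair>& gpair, std::vector<double>* hist) {
  const std::size_t n_rows = gpair.size();
  for (std::size_t r = 0; r < n_rows; ++r) {
    for (std::size_t f = 0; f < n_features; ++f) {
      const std::uint32_t bin = bin_offsets[f] + bins[r * n_features + f];
      (*hist)[2 * bin] += gpair[r].grad;
      (*hist)[2 * bin + 1] += gpair[r].hess;
    }
  }
}

// Column-wise traversal: one feature at a time, so writes stay inside that
// feature's small bin range even when the full histogram exceeds L2.
void BuildHistColWise(const std::vector<std::uint8_t>& bins, std::size_t n_features,
                      const std::vector<std::uint32_t>& bin_offsets,
                      const std::vector<ToyGradientPair>& gpair, std::vector<double>* hist) {
  const std::size_t n_rows = gpair.size();
  for (std::size_t f = 0; f < n_features; ++f) {
    const std::uint32_t offset = bin_offsets[f];
    for (std::size_t r = 0; r < n_rows; ++r) {
      const std::uint32_t bin = offset + bins[r * n_features + f];
      (*hist)[2 * bin] += gpair[r].grad;
      (*hist)[2 * bin + 1] += gpair[r].hess;
    }
  }
}

// Heuristic in the same spirit as the patch: prefer the column-wise path only
// when two values per bin would overflow an ad hoc ~0.8 MB share of L2.
bool ReadByColumn(std::size_t total_bins) {
  constexpr double kAdhocL2Size = 1024 * 1024 * 0.8;
  return kAdhocL2Size <= 2.0 * sizeof(float) * total_bins;
}

int main() {
  const std::size_t n_features = 2;
  // 4 rows x 2 features of bin indices; feature 0 owns global bins 0-2,
  // feature 1 owns global bins 3-5.
  const std::vector<std::uint8_t> bins = {0, 1, 2, 0, 1, 1, 2, 2};
  const std::vector<std::uint32_t> bin_offsets = {0, 3};
  const std::vector<ToyGradientPair> gpair = {{1.f, 1.f}, {2.f, 1.f}, {-1.f, 1.f}, {0.5f, 1.f}};

  std::vector<double> h_row(2 * 6, 0.0), h_col(2 * 6, 0.0);
  BuildHistRowWise(bins, n_features, bin_offsets, gpair, &h_row);
  BuildHistColWise(bins, n_features, bin_offsets, gpair, &h_col);
  assert(h_row == h_col);    // same sums per bin; only the traversal order differs
  assert(!ReadByColumn(6));  // a 6-bin histogram easily fits the L2 budget
  return 0;
}

Both traversals accumulate identical values for every bin (each bin receives the same rows in the same order), which is why the tests above can simply rerun the existing checks with force_read_by_column set to true.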