From 2dd333b90dc81cf0fac62f737db494278bb75415 Mon Sep 17 00:00:00 2001
From: fis
Date: Fri, 20 May 2022 12:23:12 +0800
Subject: [PATCH 1/5] Remove redundant init method.

---
 src/data/gradient_index.cc            | 86 +++++++++++++--------------
 src/data/gradient_index.h             |  6 +-
 tests/cpp/data/test_gradient_index.cc |  4 +-
 tests/cpp/tree/hist/test_histogram.cc |  5 +-
 4 files changed, 46 insertions(+), 55 deletions(-)

diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 791bb47e7105..6ce1a3cb94e7 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -17,10 +17,48 @@ namespace xgboost {
 GHistIndexMatrix::GHistIndexMatrix() : columns_{std::make_unique()} {}
-GHistIndexMatrix::GHistIndexMatrix(DMatrix *x, int32_t max_bin, double sparse_thresh,
-                                   bool sorted_sketch, int32_t n_threads,
+GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
+                                   double sparse_thresh, bool sorted_sketch, int32_t n_threads,
                                    common::Span hess) {
-  this->Init(x, max_bin, sparse_thresh, sorted_sketch, n_threads, hess);
+  CHECK(p_fmat->SingleColBlock());
+  // We use sorted sketching for approx tree method since it's more efficient in
+  // computation time (but higher memory usage).
+  cut = common::SketchOnDMatrix(p_fmat, max_bins_per_feat, n_threads, sorted_sketch, hess);
+
+  max_num_bins = max_bins_per_feat;
+  const uint32_t nbins = cut.Ptrs().back();
+  hit_count.resize(nbins, 0);
+  hit_count_tloc_.resize(n_threads * nbins, 0);
+
+  size_t new_size = 1;
+  for (const auto &batch : p_fmat->GetBatches()) {
+    new_size += batch.Size();
+  }
+
+  row_ptr.resize(new_size);
+  row_ptr[0] = 0;
+
+  size_t rbegin = 0;
+  size_t prev_sum = 0;
+  const bool isDense = p_fmat->IsDense();
+  this->isDense_ = isDense;
+  auto ft = p_fmat->Info().feature_types.ConstHostSpan();
+
+  for (const auto &batch : p_fmat->GetBatches()) {
+    this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
+    prev_sum = row_ptr[rbegin + batch.Size()];
+    rbegin += batch.Size();
+  }
+  this->columns_ = std::make_unique();
+
+  // hessian is empty when hist tree method is used or when dataset is empty
+  if (hess.empty() && !std::isnan(sparse_thresh)) {
+    // hist
+    CHECK(!sorted_sketch);
+    for (auto const &page : p_fmat->GetBatches()) {
+      this->columns_->Init(page, *this, sparse_thresh, n_threads);
+    }
+  }
 }
 GHistIndexMatrix::~GHistIndexMatrix() = default;
@@ -89,48 +127,6 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch,
   });
 }
-void GHistIndexMatrix::Init(DMatrix *p_fmat, int max_bins, double sparse_thresh, bool sorted_sketch,
-                            int32_t n_threads, common::Span hess) {
-  // We use sorted sketching for approx tree method since it's more efficient in
-  // computation time (but higher memory usage).
-  cut = common::SketchOnDMatrix(p_fmat, max_bins, n_threads, sorted_sketch, hess);
-
-  max_num_bins = max_bins;
-  const uint32_t nbins = cut.Ptrs().back();
-  hit_count.resize(nbins, 0);
-  hit_count_tloc_.resize(n_threads * nbins, 0);
-
-  size_t new_size = 1;
-  for (const auto &batch : p_fmat->GetBatches()) {
-    new_size += batch.Size();
-  }
-
-  row_ptr.resize(new_size);
-  row_ptr[0] = 0;
-
-  size_t rbegin = 0;
-  size_t prev_sum = 0;
-  const bool isDense = p_fmat->IsDense();
-  this->isDense_ = isDense;
-  auto ft = p_fmat->Info().feature_types.ConstHostSpan();
-
-  for (const auto &batch : p_fmat->GetBatches()) {
-    this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
-    prev_sum = row_ptr[rbegin + batch.Size()];
-    rbegin += batch.Size();
-  }
-  this->columns_ = std::make_unique();
-
-  // hessian is empty when hist tree method is used or when dataset is empty
-  if (hess.empty() && !std::isnan(sparse_thresh)) {
-    // hist
-    CHECK(!sorted_sketch);
-    for (auto const &page : p_fmat->GetBatches()) {
-      this->columns_->Init(page, *this, sparse_thresh, n_threads);
-    }
-  }
-}
-
 void GHistIndexMatrix::Init(SparsePage const &batch, common::Span ft,
                             common::HistogramCuts const &cuts, int32_t max_bins_per_feat,
                             bool isDense, double sparse_thresh, int32_t n_threads) {
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 3d179e0fd3ad..dac6ca38d5ae 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -50,13 +50,11 @@ class GHistIndexMatrix {
   size_t base_rowid{0};
   GHistIndexMatrix();
-  GHistIndexMatrix(DMatrix* x, int32_t max_bin, double sparse_thresh, bool sorted_sketch,
-                   int32_t n_threads, common::Span hess = {});
+  GHistIndexMatrix(DMatrix* x, bst_bin_t max_bins_per_feat, double sparse_thresh,
+                   bool sorted_sketch, int32_t n_threads, common::Span hess = {});
   ~GHistIndexMatrix();
   // Create a global histogram matrix, given cut
-  void Init(DMatrix* p_fmat, int max_bins, double sparse_thresh, bool sorted_sketch,
-            int32_t n_threads, common::Span hess);
   void Init(SparsePage const& page, common::Span ft,
             common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense,
             double sparse_thresh, int32_t n_threads);
diff --git a/tests/cpp/data/test_gradient_index.cc b/tests/cpp/data/test_gradient_index.cc
index 6bf12a060dcd..8f5d7d3d759d 100644
--- a/tests/cpp/data/test_gradient_index.cc
+++ b/tests/cpp/data/test_gradient_index.cc
@@ -44,9 +44,7 @@ TEST(GradientIndex, FromCategoricalBasic) {
   h_ft.resize(kCols, FeatureType::kCategorical);
   BatchParam p(max_bins, 0.8);
-  GHistIndexMatrix gidx;
-
-  gidx.Init(m.get(), max_bins, p.sparse_thresh, false, common::OmpGetNumThreads(0), {});
+  GHistIndexMatrix gidx(m.get(), max_bins, p.sparse_thresh, false, common::OmpGetNumThreads(0), {});
   auto x_copy = x;
   std::sort(x_copy.begin(), x_copy.end());
diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc
index c0bd62629899..a72b66e49287 100644
--- a/tests/cpp/tree/hist/test_histogram.cc
+++ b/tests/cpp/tree/hist/test_histogram.cc
@@ -413,10 +413,9 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) {
     single_build.Reset(total_bins, batch_param, common::OmpGetNumThreads(0), 1, false);
     SparsePage concat;
-    GHistIndexMatrix gmat;
     std::vector hess(m->Info().num_row_, 1.0f);
-    gmat.Init(m.get(), batch_param.max_bin, std::numeric_limits::quiet_NaN(), false,
-              common::OmpGetNumThreads(0), hess);
+    GHistIndexMatrix gmat(m.get(), batch_param.max_bin, std::numeric_limits::quiet_NaN(),
+                          false, common::OmpGetNumThreads(0), hess);
     single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair);
     single_page = single_build.Histogram()[0];
   }

From 8d66598c4ce1b4605418f705c2be34b9a89cdb08 Mon Sep 17 00:00:00 2001
From: fis
Date: Fri, 20 May 2022 12:26:50 +0800
Subject: [PATCH 2/5] Remove old external memory code.

---
 src/data/gradient_index.cc | 29 ++++++++++-------------------
 src/data/gradient_index.h  | 17 ++++++++---------
 2 files changed, 18 insertions(+), 28 deletions(-)

diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc
index 6ce1a3cb94e7..c2a61093a25c 100644
--- a/src/data/gradient_index.cc
+++ b/src/data/gradient_index.cc
@@ -38,16 +38,12 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
   row_ptr.resize(new_size);
   row_ptr[0] = 0;
-  size_t rbegin = 0;
-  size_t prev_sum = 0;
   const bool isDense = p_fmat->IsDense();
   this->isDense_ = isDense;
   auto ft = p_fmat->Info().feature_types.ConstHostSpan();
   for (const auto &batch : p_fmat->GetBatches()) {
-    this->PushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads);
-    prev_sum = row_ptr[rbegin + batch.Size()];
-    rbegin += batch.Size();
+    this->PushBatch(batch, ft, nbins, n_threads);
   }
   this->columns_ = std::make_unique();
@@ -63,19 +59,17 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat,
 GHistIndexMatrix::~GHistIndexMatrix() = default;
-void GHistIndexMatrix::PushBatch(SparsePage const &batch,
-                                 common::Span ft,
-                                 size_t rbegin, size_t prev_sum, uint32_t nbins,
-                                 int32_t n_threads) {
+void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span ft,
+                                 uint32_t nbins, int32_t n_threads) {
   auto page = batch.GetView();
   auto it = common::MakeIndexTransformIter([&](size_t ridx) { return page[ridx].size(); });
-  common::PartialSum(n_threads, it, it + page.Size(), prev_sum, row_ptr.begin() + rbegin);
+  common::PartialSum(n_threads, it, it + page.Size(), static_cast(0), row_ptr.begin());
   // The number of threads is pegged to the batch size.
If the OMP block is parallelized // on anything other than the batch/block size, it should be reassigned const size_t batch_threads = std::max(static_cast(1), std::min(batch.Size(), static_cast(n_threads))); - const size_t n_index = row_ptr[rbegin + batch.Size()]; // number of entries in this page + const size_t n_index = row_ptr[batch.Size()]; // number of entries in this page ResizeIndex(n_index, isDense_); CHECK_GT(cut.Values().size(), 0U); @@ -92,20 +86,20 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::BinTypeSize curent_bin_size = index.GetBinTypeSize(); if (curent_bin_size == common::kUint8BinsTypeSize) { common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, nbins, [offsets](auto bin_idx, auto fidx) { return static_cast(bin_idx - offsets[fidx]); }); } else if (curent_bin_size == common::kUint16BinsTypeSize) { common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, nbins, [offsets](auto bin_idx, auto fidx) { return static_cast(bin_idx - offsets[fidx]); }); } else { CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize); common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, nbins, [offsets](auto bin_idx, auto fidx) { return static_cast(bin_idx - offsets[fidx]); }); @@ -115,7 +109,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, in index field to chose right offset. So offset is nullptr and index is not reduced */ common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, rbegin, nbins, + SetIndexData(index_data_span, ft, batch_threads, batch, nbins, [](auto idx, auto) { return idx; }); } @@ -144,10 +138,7 @@ void GHistIndexMatrix::Init(SparsePage const &batch, common::SpanPushBatch(batch, ft, rbegin, prev_sum, nbins, n_threads); + this->PushBatch(batch, ft, nbins, n_threads); this->columns_ = std::make_unique(); if (!std::isnan(sparse_thresh)) { this->columns_->Init(batch, *this, sparse_thresh, n_threads); diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index dac6ca38d5ae..0017f9fc8612 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -32,8 +32,8 @@ class GHistIndexMatrix { * \param rbegin The beginning row index of current page. (total rows in previous pages) * \param prev_sum Total number of entries in previous pages. */ - void PushBatch(SparsePage const& batch, common::Span ft, size_t rbegin, - size_t prev_sum, uint32_t nbins, int32_t n_threads); + void PushBatch(SparsePage const& batch, common::Span ft, uint32_t nbins, + int32_t n_threads); public: /*! \brief row pointer to rows by element position */ @@ -54,17 +54,16 @@ class GHistIndexMatrix { bool sorted_sketch, int32_t n_threads, common::Span hess = {}); ~GHistIndexMatrix(); - // Create a global histogram matrix, given cut + // Create a global histogram matrix, given cut. 
Used by external memory void Init(SparsePage const& page, common::Span ft, common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense, double sparse_thresh, int32_t n_threads); // specific method for sparse data as no possibility to reduce allocated memory template - void SetIndexData(common::Span index_data_span, - common::Span ft, - size_t batch_threads, const SparsePage &batch, - size_t rbegin, size_t nbins, GetOffset get_offset) { + void SetIndexData(common::Span index_data_span, common::Span ft, + size_t batch_threads, const SparsePage& batch, size_t nbins, + GetOffset get_offset) { const xgboost::Entry *data_ptr = batch.data.HostVector().data(); const std::vector &offset_vec = batch.offset.HostVector(); const size_t batch_size = batch.Size(); @@ -74,8 +73,8 @@ class GHistIndexMatrix { auto const& values = cut.Values(); common::ParallelFor(batch_size, batch_threads, [&](omp_ulong ridx) { const int tid = omp_get_thread_num(); - size_t ibegin = row_ptr[rbegin + ridx]; // index of first entry for current block - size_t iend = row_ptr[rbegin + ridx + 1]; // first entry for next block + size_t ibegin = row_ptr[ridx]; // index of first entry for current block + size_t iend = row_ptr[ridx + 1]; // first entry for next block const size_t size = offset_vec[ridx + 1] - offset_vec[ridx]; SparsePage::Inst inst = {data_ptr + offset_vec[ridx], size}; CHECK_EQ(ibegin + inst.size(), iend); From 8b321233e9ae31428b853c250354370bba3d04f2 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 May 2022 12:37:30 +0800 Subject: [PATCH 3/5] Use adapter function. --- src/common/hist_util.h | 15 ++++----- src/data/gradient_index.cc | 41 +++++++++------------- src/data/gradient_index.h | 69 ++++++++++++++++++-------------------- 3 files changed, 55 insertions(+), 70 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 6671f05e3b69..9a87e5222ae2 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -125,26 +125,25 @@ class HistogramCuts { /** * \brief Search the bin index for numerical feature. */ - bst_bin_t SearchBin(Entry const& e) const { - return SearchBin(e.fvalue, e.index); - } + bst_bin_t SearchBin(Entry const& e) const { return SearchBin(e.fvalue, e.index); } /** * \brief Search the bin index for categorical feature. */ - bst_bin_t SearchCatBin(Entry const &e) const { + bst_bin_t SearchCatBin(float value, bst_feature_t fidx) const { auto const &ptrs = this->Ptrs(); auto const &vals = this->Values(); - auto end = ptrs.at(e.index + 1) + vals.cbegin(); - auto beg = ptrs[e.index] + vals.cbegin(); + auto end = ptrs.at(fidx + 1) + vals.cbegin(); + auto beg = ptrs[fidx] + vals.cbegin(); // Truncates the value in case it's not perfectly rounded. 
- auto v = static_cast(common::AsCat(e.fvalue)); + auto v = static_cast(common::AsCat(value)); auto bin_idx = std::lower_bound(beg, end, v) - vals.cbegin(); - if (bin_idx == ptrs.at(e.index + 1)) { + if (bin_idx == ptrs.at(fidx + 1)) { bin_idx -= 1; } return bin_idx; } + bst_bin_t SearchCatBin(Entry const& e) const { return SearchCatBin(e.fvalue, e.index); } }; /** diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index c2a61093a25c..8bb1a16035a7 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -60,7 +60,7 @@ GHistIndexMatrix::GHistIndexMatrix(DMatrix *p_fmat, bst_bin_t max_bins_per_feat, GHistIndexMatrix::~GHistIndexMatrix() = default; void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span ft, - uint32_t nbins, int32_t n_threads) { + bst_bin_t n_total_bins, int32_t n_threads) { auto page = batch.GetView(); auto it = common::MakeIndexTransformIter([&](size_t ridx) { return page[ridx].size(); }); common::PartialSum(n_threads, it, it + page.Size(), static_cast(0), row_ptr.begin()); @@ -79,44 +79,33 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, nbins, - [offsets](auto bin_idx, auto fidx) { - return static_cast(bin_idx - offsets[fidx]); - }); - } else if (curent_bin_size == common::kUint16BinsTypeSize) { - common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, nbins, - [offsets](auto bin_idx, auto fidx) { - return static_cast(bin_idx - offsets[fidx]); - }); - } else { - CHECK_EQ(curent_bin_size, common::kUint32BinsTypeSize); - common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, nbins, - [offsets](auto bin_idx, auto fidx) { - return static_cast(bin_idx - offsets[fidx]); - }); - } + common::DispatchBinType(index.GetBinTypeSize(), [&](auto dtype) { + using T = decltype(dtype); + common::Span index_data_span = {index.data(), index.Size()}; + SetIndexData( + index_data_span, {}, batch_threads, adapter_batch, is_valid, n_bins_total, + [offsets](auto bin_idx, auto fidx) { return static_cast(bin_idx - offsets[fidx]); }); + }); } else { /* For sparse DMatrix we have to store index of feature for each bin in index field to chose right offset. 
So offset is nullptr and index is not reduced */ common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, ft, batch_threads, batch, nbins, + SetIndexData(index_data_span, {}, batch_threads, adapter_batch, is_valid, n_bins_total, [](auto idx, auto) { return idx; }); } - common::ParallelFor(nbins, n_threads, [&](bst_omp_uint idx) { + common::ParallelFor(n_total_bins, n_threads, [&](bst_omp_uint idx) { for (int32_t tid = 0; tid < n_threads; ++tid) { - hit_count[idx] += hit_count_tloc_[tid * nbins + idx]; - hit_count_tloc_[tid * nbins + idx] = 0; // reset for next batch + hit_count[idx] += hit_count_tloc_[tid * n_total_bins + idx]; + hit_count_tloc_[tid * n_total_bins + idx] = 0; // reset for next batch } }); } diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h index 0017f9fc8612..7074a3d9dee4 100644 --- a/src/data/gradient_index.h +++ b/src/data/gradient_index.h @@ -10,6 +10,7 @@ #include "../common/categorical.h" #include "../common/hist_util.h" #include "../common/threading_utils.h" +#include "adapter.h" #include "xgboost/base.h" #include "xgboost/data.h" @@ -32,8 +33,38 @@ class GHistIndexMatrix { * \param rbegin The beginning row index of current page. (total rows in previous pages) * \param prev_sum Total number of entries in previous pages. */ - void PushBatch(SparsePage const& batch, common::Span ft, uint32_t nbins, - int32_t n_threads); + void PushBatch(SparsePage const& batch, common::Span ft, + bst_bin_t n_total_bins, int32_t n_threads); + + template + void SetIndexData(common::Span index_data_span, common::Span ft, + size_t batch_threads, Batch const& batch, IsValid&& is_valid, size_t nbins, + GetOffset&& get_offset) { + auto batch_size = batch.Size(); + BinIdxType* index_data = index_data_span.data(); + auto const& ptrs = cut.Ptrs(); + auto const& values = cut.Values(); + common::ParallelFor(batch_size, batch_threads, [&](size_t i) { + auto line = batch.GetLine(i); + size_t ibegin = row_ptr[i]; // index of first entry for current block + size_t k = 0; + auto tid = omp_get_thread_num(); + for (size_t j = 0; j < line.Size(); ++j) { + data::COOTuple elem = line.GetElement(j); + if (is_valid(elem)) { + bst_bin_t bin_idx{-1}; + if (common::IsCat(ft, elem.column_idx)) { + bin_idx = cut.SearchCatBin(elem.value, elem.column_idx); + } else { + bin_idx = cut.SearchBin(elem.value, elem.column_idx, ptrs, values); + } + index_data[ibegin + k] = get_offset(bin_idx, j); + ++hit_count_tloc_[tid * nbins + bin_idx]; + ++k; + } + } + }); + } public: /*! 
\brief row pointer to rows by element position */ @@ -59,40 +90,6 @@ class GHistIndexMatrix { common::HistogramCuts const& cuts, int32_t max_bins_per_feat, bool is_dense, double sparse_thresh, int32_t n_threads); - // specific method for sparse data as no possibility to reduce allocated memory - template - void SetIndexData(common::Span index_data_span, common::Span ft, - size_t batch_threads, const SparsePage& batch, size_t nbins, - GetOffset get_offset) { - const xgboost::Entry *data_ptr = batch.data.HostVector().data(); - const std::vector &offset_vec = batch.offset.HostVector(); - const size_t batch_size = batch.Size(); - CHECK_LT(batch_size, offset_vec.size()); - BinIdxType* index_data = index_data_span.data(); - auto const& ptrs = cut.Ptrs(); - auto const& values = cut.Values(); - common::ParallelFor(batch_size, batch_threads, [&](omp_ulong ridx) { - const int tid = omp_get_thread_num(); - size_t ibegin = row_ptr[ridx]; // index of first entry for current block - size_t iend = row_ptr[ridx + 1]; // first entry for next block - const size_t size = offset_vec[ridx + 1] - offset_vec[ridx]; - SparsePage::Inst inst = {data_ptr + offset_vec[ridx], size}; - CHECK_EQ(ibegin + inst.size(), iend); - for (bst_uint j = 0; j < inst.size(); ++j) { - auto e = inst[j]; - if (common::IsCat(ft, e.index)) { - bst_bin_t bin_idx = cut.SearchCatBin(e); - index_data[ibegin + j] = get_offset(bin_idx, j); - ++hit_count_tloc_[tid * nbins + bin_idx]; - } else { - bst_bin_t bin_idx = cut.SearchBin(e.fvalue, e.index, ptrs, values); - index_data[ibegin + j] = get_offset(bin_idx, j); - ++hit_count_tloc_[tid * nbins + bin_idx]; - } - } - }); - } - void ResizeIndex(const size_t n_index, const bool isDense); void GetFeatureCounts(size_t* counts) const { From 58fba4e143515208803d2103d911624b34dc7a3a Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 May 2022 12:53:06 +0800 Subject: [PATCH 4/5] Remove old tests. 
--- src/data/gradient_index.cc | 4 ++-- tests/cpp/common/test_column_matrix.cc | 17 ----------------- 2 files changed, 2 insertions(+), 19 deletions(-) diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 8bb1a16035a7..6f8c5ee9f44e 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -90,7 +90,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span index_data_span = {index.data(), index.Size()}; SetIndexData( - index_data_span, {}, batch_threads, adapter_batch, is_valid, n_bins_total, + index_data_span, ft, batch_threads, adapter_batch, is_valid, n_bins_total, [offsets](auto bin_idx, auto fidx) { return static_cast(bin_idx - offsets[fidx]); }); }); } else { @@ -98,7 +98,7 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span index_data_span = {index.data(), n_index}; - SetIndexData(index_data_span, {}, batch_threads, adapter_batch, is_valid, n_bins_total, + SetIndexData(index_data_span, ft, batch_threads, adapter_batch, is_valid, n_bins_total, [](auto idx, auto) { return idx; }); } diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 4b6b0e91d8d1..cdd38468a25c 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -107,22 +107,5 @@ TEST(DenseColumnWithMissing, Test) { }); } } - -void TestGHistIndexMatrixCreation(size_t nthreads) { - size_t constexpr kPageSize = 1024, kEntriesPerCol = 3; - size_t constexpr kEntries = kPageSize * kEntriesPerCol * 2; - /* This should create multiple sparse pages */ - std::unique_ptr dmat{CreateSparsePageDMatrix(kEntries)}; - GHistIndexMatrix gmat(dmat.get(), 256, 0.5f, false, common::OmpGetNumThreads(nthreads)); -} - -TEST(HistIndexCreationWithExternalMemory, Test) { - // Vary the number of threads to make sure that the last batch - // is distributed properly to the available number of threads - // in the thread pool - TestGHistIndexMatrixCreation(20); - TestGHistIndexMatrixCreation(30); - TestGHistIndexMatrixCreation(40); -} } // namespace common } // namespace xgboost From eac8b0462a9aaeded91b3bfed09c74ba547b6f9d Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 20 May 2022 13:09:03 +0800 Subject: [PATCH 5/5] Revise histogram test. --- tests/cpp/tree/hist/test_histogram.cc | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index a72b66e49287..d20c3d0d8bbd 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -414,8 +414,15 @@ void TestHistogramExternalMemory(BatchParam batch_param, bool is_approx) { single_build.Reset(total_bins, batch_param, common::OmpGetNumThreads(0), 1, false); SparsePage concat; std::vector hess(m->Info().num_row_, 1.0f); - GHistIndexMatrix gmat(m.get(), batch_param.max_bin, std::numeric_limits::quiet_NaN(), - false, common::OmpGetNumThreads(0), hess); + for (auto const& page : m->GetBatches()) { + concat.Push(page); + } + + auto cut = common::SketchOnDMatrix(m.get(), batch_param.max_bin, common::OmpGetNumThreads(0), + false, hess); + GHistIndexMatrix gmat; + gmat.Init(concat, {}, cut, batch_param.max_bin, false, std::numeric_limits::quiet_NaN(), + common::OmpGetNumThreads(0)); single_build.BuildHist(0, gmat, &tree, row_set_collection, nodes, {}, h_gpair); single_page = single_build.Histogram()[0]; }
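A minimal usage sketch of the two construction paths that remain once the DMatrix-based Init() is gone, for readers following the series: in-core callers use the constructor from PATCH 1/5, while external-memory callers follow the revised histogram test from PATCH 5/5 and go through the SparsePage-based Init(). The wrapper functions, the include paths, the literal values (256, 0.2, 1.0f), and the restored <float> and <SparsePage> template arguments are illustrative assumptions, not taken from the diffs.

#include <limits>
#include <vector>

#include "../../src/common/hist_util.h"     // common::SketchOnDMatrix (path is an assumption)
#include "../../src/data/gradient_index.h"  // GHistIndexMatrix

namespace xgboost {

// In-core path: the constructor added in PATCH 1/5 replaces the removed
// Init(DMatrix*, ...). It expects a single in-memory page (SingleColBlock).
// An empty hessian with a finite sparse_thresh is the hist setup, so the
// internal column matrix is initialized as well.
void InCoreSketch(DMatrix* dmat, int32_t n_threads) {
  GHistIndexMatrix gidx{dmat, /*max_bins_per_feat=*/256, /*sparse_thresh=*/0.2,
                        /*sorted_sketch=*/false, n_threads};
}

// External-memory path, mirroring the revised test in PATCH 5/5: concatenate
// the pages, sketch the cuts up front, then call the SparsePage-based Init().
void ExternalMemorySketch(DMatrix* dmat, int32_t n_threads) {
  SparsePage concat;
  std::vector<float> hess(dmat->Info().num_row_, 1.0f);
  for (auto const& page : dmat->GetBatches<SparsePage>()) {
    concat.Push(page);
  }
  auto cut = common::SketchOnDMatrix(dmat, /*max_bins=*/256, n_threads,
                                     /*use_sorted=*/false, hess);
  GHistIndexMatrix gmat;
  gmat.Init(concat, {}, cut, /*max_bins_per_feat=*/256, /*is_dense=*/false,
            std::numeric_limits<float>::quiet_NaN(), n_threads);
}

}  // namespace xgboost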