From 2567404ab601149c52b66c3a02d85e9571a84447 Mon Sep 17 00:00:00 2001 From: ShvetsKS <33296480+ShvetsKS@users.noreply.github.com> Date: Fri, 11 Jun 2021 13:26:30 +0300 Subject: [PATCH] Simplify sparse and dense CPU hist kernels (#7029) * Simplify sparse and dense kernels * Extract row partitioner. Co-authored-by: Kirill Shvets --- src/common/column_matrix.h | 46 ++++- src/common/hist_util.cc | 131 +++++------- src/common/hist_util.h | 4 +- src/common/partition_builder.h | 228 +++++++++++++++++++++ src/common/row_set.h | 124 ----------- src/tree/updater_quantile_hist.cc | 214 +++---------------- src/tree/updater_quantile_hist.h | 20 +- tests/cpp/common/test_column_matrix.cc | 20 +- tests/cpp/common/test_partition_builder.cc | 1 + tests/cpp/tree/test_quantile_hist.cc | 13 +- 10 files changed, 368 insertions(+), 433 deletions(-) create mode 100644 src/common/partition_builder.h diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h index aeb0f309af67..56cc89c05998 100644 --- a/src/common/column_matrix.h +++ b/src/common/column_matrix.h @@ -30,6 +30,8 @@ enum ColumnType { template class Column { public: + static constexpr int32_t kMissingId = -1; + Column(ColumnType type, common::Span index, const uint32_t index_base) : type_(type), index_(index), @@ -71,6 +73,30 @@ class SparseColumn: public Column { const size_t* GetRowData() const { return row_ind_.data(); } + int32_t GetBinIdx(size_t rid, size_t* state) const { + const size_t column_size = this->Size(); + if (!((*state) < column_size)) { + return this->kMissingId; + } + while ((*state) < column_size && GetRowIdx(*state) < rid) { + ++(*state); + } + if (((*state) < column_size) && GetRowIdx(*state) == rid) { + return this->GetGlobalBinIdx(*state); + } else { + return this->kMissingId; + } + } + + size_t GetInitialState(const size_t first_row_id) const { + const size_t* row_data = GetRowData(); + const size_t column_size = this->Size(); + // search first nonzero row with index >= rid_span.front() + const size_t* p = std::lower_bound(row_data, row_data + column_size, first_row_id); + // column_size if all messing + return p - row_data; + } + size_t GetRowIdx(size_t idx) const { return row_ind_.data()[idx]; } @@ -80,7 +106,7 @@ class SparseColumn: public Column { common::Span row_ind_; }; -template +template class DenseColumn: public Column { public: DenseColumn(ColumnType type, common::Span index, @@ -90,6 +116,19 @@ class DenseColumn: public Column { missing_flags_(missing_flags), feature_offset_(feature_offset) {} bool IsMissing(size_t idx) const { return missing_flags_[feature_offset_ + idx]; } + + int32_t GetBinIdx(size_t idx, size_t* state) const { + if (any_missing) { + return IsMissing(idx) ? this->kMissingId : this->GetGlobalBinIdx(idx); + } else { + return this->GetGlobalBinIdx(idx); + } + } + + size_t GetInitialState(const size_t first_row_id) const { + return 0; + } + private: /* flags for missing values in dense columns */ const std::vector& missing_flags_; @@ -202,7 +241,7 @@ class ColumnMatrix { /* Fetch an individual column. 
This code should be used with type swith to determine type of bin id's */ - template + template std::unique_ptr > GetColumn(unsigned fid) const { CHECK_EQ(sizeof(BinIdxType), bins_type_size_); @@ -213,7 +252,8 @@ class ColumnMatrix { column_size }; std::unique_ptr > res; if (type_[fid] == ColumnType::kDenseColumn) { - res.reset(new DenseColumn(type_[fid], bin_index, index_base_[fid], + CHECK_EQ(any_missing, any_missing_); + res.reset(new DenseColumn(type_[fid], bin_index, index_base_[fid], missing_flags_, feature_offset)); } else { res.reset(new SparseColumn(type_[fid], bin_index, index_base_[fid], diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index d65d19f998cb..edb22b613c26 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -287,17 +287,18 @@ struct Prefetch { constexpr size_t Prefetch::kNoPrefetchSize; -template -void BuildHistDenseKernel(const std::vector& gpair, +template +void BuildHistKernel(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, - const size_t n_features, GHistRow hist) { const size_t size = row_indices.Size(); const size_t* rid = row_indices.begin; const float* pgh = reinterpret_cast(gpair.data()); const BinIdxType* gradient_index = gmat.index.data(); + const size_t* row_ptr = gmat.row_ptr.data(); const uint32_t* offsets = gmat.index.Offset(); + const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]]; FPType* hist_data = reinterpret_cast(hist.data()); const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains // 2 FP values: gradient and hessian. @@ -305,101 +306,51 @@ void BuildHistDenseKernel(const std::vector& gpair, // to work with gradient pairs as a singe row FP array for (size_t i = 0; i < size; ++i) { - const size_t icol_start = rid[i] * n_features; + const size_t icol_start = any_missing ? row_ptr[rid[i]] : rid[i] * n_features; + const size_t icol_end = any_missing ? row_ptr[rid[i]+1] : icol_start + n_features; + const size_t row_size = icol_end - icol_start; const size_t idx_gh = two * rid[i]; if (do_prefetch) { - const size_t icol_start_prefetch = rid[i + Prefetch::kPrefetchOffset] * n_features; + const size_t icol_start_prftch = any_missing ? row_ptr[rid[i+Prefetch::kPrefetchOffset]] : + rid[i + Prefetch::kPrefetchOffset] * n_features; + const size_t icol_end_prefect = any_missing ? 
row_ptr[rid[i+Prefetch::kPrefetchOffset]+1] : + icol_start_prftch + n_features; PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]); - for (size_t j = icol_start_prefetch; j < icol_start_prefetch + n_features; - j += Prefetch::GetPrefetchStep()) { + for (size_t j = icol_start_prftch; j < icol_end_prefect; + j+=Prefetch::GetPrefetchStep()) { PREFETCH_READ_T0(gradient_index + j); } } const BinIdxType* gr_index_local = gradient_index + icol_start; - for (size_t j = 0; j < n_features; ++j) { - const uint32_t idx_bin = two * (static_cast(gr_index_local[j]) + - offsets[j]); - - hist_data[idx_bin] += pgh[idx_gh]; - hist_data[idx_bin+1] += pgh[idx_gh+1]; - } - } -} - -template -void BuildHistSparseKernel(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, - GHistRow hist) { - const size_t size = row_indices.Size(); - const size_t* rid = row_indices.begin; - const float* pgh = reinterpret_cast(gpair.data()); - const uint32_t* gradient_index = gmat.index.data(); - const size_t* row_ptr = gmat.row_ptr.data(); - FPType* hist_data = reinterpret_cast(hist.data()); - const uint32_t two {2}; // Each element from 'gpair' and 'hist' contains - // 2 FP values: gradient and hessian. - // So we need to multiply each row-index/bin-index by 2 - // to work with gradient pairs as a singe row FP array - for (size_t i = 0; i < size; ++i) { - const size_t icol_start = row_ptr[rid[i]]; - const size_t icol_end = row_ptr[rid[i]+1]; - const size_t idx_gh = two * rid[i]; + for (size_t j = 0; j < row_size; ++j) { + const uint32_t idx_bin = two * (static_cast(gr_index_local[j]) + ( + any_missing ? 0 : offsets[j])); - if (do_prefetch) { - const size_t icol_start_prftch = row_ptr[rid[i+Prefetch::kPrefetchOffset]]; - const size_t icol_end_prefect = row_ptr[rid[i+Prefetch::kPrefetchOffset]+1]; - - PREFETCH_READ_T0(pgh + two * rid[i + Prefetch::kPrefetchOffset]); - for (size_t j = icol_start_prftch; j < icol_end_prefect; - j+=Prefetch::GetPrefetchStep()) { - PREFETCH_READ_T0(gradient_index + j); - } - } - for (size_t j = icol_start; j < icol_end; ++j) { - const uint32_t idx_bin = two * gradient_index[j]; hist_data[idx_bin] += pgh[idx_gh]; hist_data[idx_bin+1] += pgh[idx_gh+1]; } } } - -template -void BuildHistDispatchKernel(const std::vector& gpair, +template +void BuildHistDispatch(const std::vector& gpair, const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, GHistRow hist, bool isDense) { - if (isDense) { - const size_t* row_ptr = gmat.row_ptr.data(); - const size_t n_features = row_ptr[row_indices.begin[0]+1] - row_ptr[row_indices.begin[0]]; - BuildHistDenseKernel(gpair, row_indices, - gmat, n_features, hist); - } else { - BuildHistSparseKernel(gpair, row_indices, - gmat, hist); - } -} - -template -void BuildHistKernel(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, const bool isDense, GHistRow hist) { - const bool is_dense = row_indices.Size() && isDense; + const GHistIndexMatrix& gmat, GHistRow hist) { switch (gmat.index.GetBinTypeSize()) { case kUint8BinsTypeSize: - BuildHistDispatchKernel(gpair, row_indices, - gmat, hist, is_dense); + BuildHistKernel(gpair, row_indices, + gmat, hist); break; case kUint16BinsTypeSize: - BuildHistDispatchKernel(gpair, row_indices, - gmat, hist, is_dense); + BuildHistKernel(gpair, row_indices, + gmat, hist); break; case kUint32BinsTypeSize: - BuildHistDispatchKernel(gpair, row_indices, - gmat, hist, is_dense); + BuildHistKernel(gpair, row_indices, + gmat, 
hist); break; default: CHECK(false); // no default behavior @@ -407,10 +358,12 @@ void BuildHistKernel(const std::vector& gpair, } template +template void GHistBuilder::BuildHist( const std::vector &gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRowT hist, bool isDense) { + const RowSetCollection::Elem row_indices, + const GHistIndexMatrix &gmat, + GHistRowT hist) { const size_t nrows = row_indices.Size(); const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows); @@ -419,28 +372,36 @@ void GHistBuilder::BuildHist( if (contiguousBlock) { // contiguous memory access, built-in HW prefetching is enough - BuildHistKernel(gpair, row_indices, gmat, isDense, hist); + BuildHistDispatch(gpair, row_indices, gmat, hist); } else { const RowSetCollection::Elem span1(row_indices.begin, row_indices.end - no_prefetch_size); const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, row_indices.end); - BuildHistKernel(gpair, span1, gmat, isDense, hist); + BuildHistDispatch(gpair, span1, gmat, hist); // no prefetching to avoid loading extra memory - BuildHistKernel(gpair, span2, gmat, isDense, hist); + BuildHistDispatch(gpair, span2, gmat, hist); } } template -void GHistBuilder::BuildHist(const std::vector& gpair, +void GHistBuilder::BuildHist(const std::vector& gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexMatrix& gmat, + GHistRow hist); +template +void GHistBuilder::BuildHist(const std::vector& gpair, + const RowSetCollection::Elem row_indices, + const GHistIndexMatrix& gmat, + GHistRow hist); +template +void GHistBuilder::BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, - GHistRow hist, - bool isDense); + GHistRow hist); template -void GHistBuilder::BuildHist(const std::vector& gpair, +void GHistBuilder::BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, - GHistRow hist, - bool isDense); + GHistRow hist); template void GHistBuilder::SubtractionTrick(GHistRowT self, diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 62a1db9562fe..b4af15ae147e 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -627,11 +627,11 @@ class GHistBuilder { GHistBuilder(size_t nthread, uint32_t nbins) : nthread_{nthread}, nbins_{nbins} {} // construct a histogram via histogram aggregation + template void BuildHist(const std::vector& gpair, const RowSetCollection::Elem row_indices, const GHistIndexMatrix& gmat, - GHistRowT hist, - bool isDense); + GHistRowT hist); // construct a histogram via subtraction trick void SubtractionTrick(GHistRowT self, GHistRowT sibling, diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h new file mode 100644 index 000000000000..23c620a4e897 --- /dev/null +++ b/src/common/partition_builder.h @@ -0,0 +1,228 @@ + +/*! 
+ * Copyright 2021 by Contributors + * \file row_set.h + * \brief Quick Utility to compute subset of rows + * \author Philip Cho, Tianqi Chen + */ +#ifndef XGBOOST_COMMON_PARTITION_BUILDER_H_ +#define XGBOOST_COMMON_PARTITION_BUILDER_H_ + +#include +#include +#include +#include +#include +#include "xgboost/tree_model.h" +#include "../common/column_matrix.h" + +namespace xgboost { +namespace common { + +// The builder is required for samples partition to left and rights children for set of nodes +// Responsible for: +// 1) Effective memory allocation for intermediate results for multi-thread work +// 2) Merging partial results produced by threads into original row set (row_set_collection_) +// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature +template +class PartitionBuilder { + public: + template + void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) { + left_right_nodes_sizes_.resize(n_nodes); + blocks_offsets_.resize(n_nodes+1); + + blocks_offsets_[0] = 0; + for (size_t i = 1; i < n_nodes+1; ++i) { + blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1); + } + + if (n_tasks > max_n_tasks_) { + mem_blocks_.resize(n_tasks); + max_n_tasks_ = n_tasks; + } + } + + // split row indexes (rid_span) to 2 parts (left_part, right_part) depending + // on comparison of indexes values (idx_span) and split point (split_cond) + // Handle dense columns + // Analog of std::stable_partition, but in no-inplace manner + template + inline std::pair PartitionKernel(const ColumnType& column, + common::Span rid_span, const int32_t split_cond, + common::Span left_part, common::Span right_part) { + size_t* p_left_part = left_part.data(); + size_t* p_right_part = right_part.data(); + size_t nleft_elems = 0; + size_t nright_elems = 0; + auto state = column.GetInitialState(rid_span.front()); + + for (auto rid : rid_span) { + const int32_t bin_id = column.GetBinIdx(rid, &state); + if (any_missing && bin_id == ColumnType::kMissingId) { + if (default_left) { + p_left_part[nleft_elems++] = rid; + } else { + p_right_part[nright_elems++] = rid; + } + } else { + if (bin_id <= split_cond) { + p_left_part[nleft_elems++] = rid; + } else { + p_right_part[nright_elems++] = rid; + } + } + } + + return {nleft_elems, nright_elems}; + } + + + template + void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range, + const int32_t split_cond, + const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { + common::Span rid_span(rid + range.begin(), rid + range.end()); + common::Span left = GetLeftBuffer(node_in_set, + range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, + range.begin(), range.end()); + const bst_uint fid = tree[nid].SplitIndex(); + const bool default_left = tree[nid].DefaultLeft(); + const auto column_ptr = column_matrix.GetColumn(fid); + + std::pair child_nodes_sizes; + + if (column_ptr->GetType() == xgboost::common::kDenseColumn) { + const common::DenseColumn& column = + static_cast& >(*(column_ptr.get())); + if (default_left) { + child_nodes_sizes = PartitionKernel(column, rid_span, + split_cond, left, right); + } else { + child_nodes_sizes = PartitionKernel(column, rid_span, + split_cond, left, right); + } + } else { + CHECK_EQ(any_missing, true); + const common::SparseColumn& column + = static_cast& >(*(column_ptr.get())); + if (default_left) { + child_nodes_sizes = PartitionKernel(column, rid_span, + split_cond, left, right); + } else { + child_nodes_sizes = PartitionKernel(column, 
rid_span, + split_cond, left, right); + } + } + + const size_t n_left = child_nodes_sizes.first; + const size_t n_right = child_nodes_sizes.second; + + SetNLeftElems(node_in_set, range.begin(), range.end(), n_left); + SetNRightElems(node_in_set, range.begin(), range.end(), n_right); + } + + + // allocate thread local memory, should be called for each specific task + void AllocateForTask(size_t id) { + if (mem_blocks_[id].get() == nullptr) { + BlockInfo* local_block_ptr = new BlockInfo; + CHECK_NE(local_block_ptr, (BlockInfo*)nullptr); + mem_blocks_[id].reset(local_block_ptr); + } + } + + common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { + const size_t task_idx = GetTaskIdx(nid, begin); + return { mem_blocks_.at(task_idx)->Left(), end - begin }; + } + + common::Span GetRightBuffer(int nid, size_t begin, size_t end) { + const size_t task_idx = GetTaskIdx(nid, begin); + return { mem_blocks_.at(task_idx)->Right(), end - begin }; + } + + void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) { + size_t task_idx = GetTaskIdx(nid, begin); + mem_blocks_.at(task_idx)->n_left = n_left; + } + + void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) { + size_t task_idx = GetTaskIdx(nid, begin); + mem_blocks_.at(task_idx)->n_right = n_right; + } + + + size_t GetNLeftElems(int nid) const { + return left_right_nodes_sizes_[nid].first; + } + + size_t GetNRightElems(int nid) const { + return left_right_nodes_sizes_[nid].second; + } + + // Each thread has partial results for some set of tree-nodes + // The function decides order of merging partial results into final row set + void CalculateRowOffsets() { + for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) { + size_t n_left = 0; + for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { + mem_blocks_[j]->n_offset_left = n_left; + n_left += mem_blocks_[j]->n_left; + } + size_t n_right = 0; + for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { + mem_blocks_[j]->n_offset_right = n_left + n_right; + n_right += mem_blocks_[j]->n_right; + } + left_right_nodes_sizes_[i] = {n_left, n_right}; + } + } + + void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { + size_t task_idx = GetTaskIdx(nid, begin); + + size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; + size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; + + const size_t* left = mem_blocks_[task_idx]->Left(); + const size_t* right = mem_blocks_[task_idx]->Right(); + + std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result); + std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); + } + + size_t GetTaskIdx(int nid, size_t begin) { + return blocks_offsets_[nid] + begin / BlockSize; + } + + protected: + struct BlockInfo{ + size_t n_left; + size_t n_right; + + size_t n_offset_left; + size_t n_offset_right; + + size_t* Left() { + return &left_data_[0]; + } + + size_t* Right() { + return &right_data_[0]; + } + private: + size_t left_data_[BlockSize]; + size_t right_data_[BlockSize]; + }; + std::vector> left_right_nodes_sizes_; + std::vector blocks_offsets_; + std::vector> mem_blocks_; + size_t max_n_tasks_ = 0; +}; + +} // namespace common +} // namespace xgboost + +#endif // XGBOOST_COMMON_PARTITION_BUILDER_H_ diff --git a/src/common/row_set.h b/src/common/row_set.h index 0838b5943be9..b81054435a58 100644 --- a/src/common/row_set.h +++ b/src/common/row_set.h @@ -126,130 +126,6 @@ class RowSetCollection { std::vector elem_of_each_node_; }; - -// The builder is required 
for samples partition to left and rights children for set of nodes -// Responsible for: -// 1) Effective memory allocation for intermediate results for multi-thread work -// 2) Merging partial results produced by threads into original row set (row_set_collection_) -// BlockSize is template to enable memory alignment easily with C++11 'alignas()' feature -template -class PartitionBuilder { - public: - template - void Init(const size_t n_tasks, size_t n_nodes, Func funcNTaks) { - left_right_nodes_sizes_.resize(n_nodes); - blocks_offsets_.resize(n_nodes+1); - - blocks_offsets_[0] = 0; - for (size_t i = 1; i < n_nodes+1; ++i) { - blocks_offsets_[i] = blocks_offsets_[i-1] + funcNTaks(i-1); - } - - if (n_tasks > max_n_tasks_) { - mem_blocks_.resize(n_tasks); - max_n_tasks_ = n_tasks; - } - } - - // allocate thread local memory, should be called for each specific task - void AllocateForTask(size_t id) { - if (mem_blocks_[id].get() == nullptr) { - BlockInfo* local_block_ptr = new BlockInfo; - CHECK_NE(local_block_ptr, (BlockInfo*)nullptr); - mem_blocks_[id].reset(local_block_ptr); - } - } - - common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { - const size_t task_idx = GetTaskIdx(nid, begin); - return { mem_blocks_.at(task_idx)->Left(), end - begin }; - } - - common::Span GetRightBuffer(int nid, size_t begin, size_t end) { - const size_t task_idx = GetTaskIdx(nid, begin); - return { mem_blocks_.at(task_idx)->Right(), end - begin }; - } - - void SetNLeftElems(int nid, size_t begin, size_t end, size_t n_left) { - size_t task_idx = GetTaskIdx(nid, begin); - mem_blocks_.at(task_idx)->n_left = n_left; - } - - void SetNRightElems(int nid, size_t begin, size_t end, size_t n_right) { - size_t task_idx = GetTaskIdx(nid, begin); - mem_blocks_.at(task_idx)->n_right = n_right; - } - - - size_t GetNLeftElems(int nid) const { - return left_right_nodes_sizes_[nid].first; - } - - size_t GetNRightElems(int nid) const { - return left_right_nodes_sizes_[nid].second; - } - - // Each thread has partial results for some set of tree-nodes - // The function decides order of merging partial results into final row set - void CalculateRowOffsets() { - for (size_t i = 0; i < blocks_offsets_.size()-1; ++i) { - size_t n_left = 0; - for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { - mem_blocks_[j]->n_offset_left = n_left; - n_left += mem_blocks_[j]->n_left; - } - size_t n_right = 0; - for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { - mem_blocks_[j]->n_offset_right = n_left + n_right; - n_right += mem_blocks_[j]->n_right; - } - left_right_nodes_sizes_[i] = {n_left, n_right}; - } - } - - void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { - size_t task_idx = GetTaskIdx(nid, begin); - - size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; - size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; - - const size_t* left = mem_blocks_[task_idx]->Left(); - const size_t* right = mem_blocks_[task_idx]->Right(); - - std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result); - std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); - } - - size_t GetTaskIdx(int nid, size_t begin) { - return blocks_offsets_[nid] + begin / BlockSize; - } - - protected: - struct BlockInfo{ - size_t n_left; - size_t n_right; - - size_t n_offset_left; - size_t n_offset_right; - - size_t* Left() { - return &left_data_[0]; - } - - size_t* Right() { - return &right_data_[0]; - } - private: - size_t left_data_[BlockSize]; - size_t 
right_data_[BlockSize]; - }; - std::vector> left_right_nodes_sizes_; - std::vector blocks_offsets_; - std::vector> mem_blocks_; - size_t max_n_tasks_ = 0; -}; - - } // namespace common } // namespace xgboost diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 2a8669f00dce..58c0e7bbfe71 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -290,6 +290,7 @@ void QuantileHistMaker::Builder::SetHistRowsAdder( } template +template void QuantileHistMaker::Builder::InitRoot( const GHistIndexMatrix &gmat, const DMatrix& fmat, @@ -307,7 +308,7 @@ void QuantileHistMaker::Builder::InitRoot( int sync_count = 0; hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree); - BuildLocalHistograms(gmat, p_tree, gpair_h); + BuildLocalHistograms(gmat, p_tree, gpair_h); hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree); this->InitNewNode(CPUExpandEntry::kRootNid, gmat, gpair_h, fmat, *p_tree); @@ -319,6 +320,7 @@ void QuantileHistMaker::Builder::InitRoot( } template +template void QuantileHistMaker::Builder::BuildLocalHistograms( const GHistIndexMatrix &gmat, RegTree *p_tree, @@ -350,7 +352,8 @@ void QuantileHistMaker::Builder::BuildLocalHistograms( auto rid_set = RowSetCollection::Elem(start_of_row_set + r.begin(), start_of_row_set + r.end(), nid); - BuildHist(gpair_h, rid_set, gmat, hist_buffer_.GetInitializedHist(tid, nid_in_set)); + hist_builder_.template BuildHist(gpair_h, rid_set, gmat, + hist_buffer_.GetInitializedHist(tid, nid_in_set)); }); builder_monitor_.Stop("BuildLocalHistograms"); @@ -439,6 +442,7 @@ void QuantileHistMaker::Builder::BuildNodeStats( } template +template void QuantileHistMaker::Builder::ExpandTree( const GHistIndexMatrix& gmat, const ColumnMatrix& column_matrix, @@ -450,7 +454,7 @@ void QuantileHistMaker::Builder::ExpandTree( Driver driver(static_cast(param_.grow_policy)); std::vector expand; - InitRoot(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand); + InitRoot(gmat, *p_fmat, p_tree, gpair_h, &num_leaves, &expand); driver.Push(expand[0]); int depth = 0; @@ -465,14 +469,14 @@ void QuantileHistMaker::Builder::ExpandTree( AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split); if (nodes_for_apply_split.size() != 0) { - ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree); + ApplySplit(nodes_for_apply_split, gmat, column_matrix, hist_, p_tree); SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); int starting_index = std::numeric_limits::max(); int sync_count = 0; hist_rows_adder_->AddHistRows(this, &starting_index, &sync_count, p_tree); if (depth < param_.max_depth) { - BuildLocalHistograms(gmat, p_tree, gpair_h); + BuildLocalHistograms(gmat, p_tree, gpair_h); hist_synchronizer_->SyncHistograms(this, starting_index, sync_count, p_tree); } @@ -520,8 +524,11 @@ void QuantileHistMaker::Builder::Update( this->InitData(gmat, *p_fmat, *p_tree, gpair_ptr); - ExpandTree(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr); - + if (column_matrix.AnyMissing()) { + ExpandTree(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr); + } else { + ExpandTree(gmat, column_matrix, p_fmat, p_tree, *gpair_ptr); + } for (int nid = 0; nid < p_tree->param.num_nodes; ++nid) { p_tree->Stat(nid).loss_chg = snode_[nid].best.loss_chg; p_tree->Stat(nid).base_weight = snode_[nid].weight; @@ -867,165 +874,6 @@ void QuantileHistMaker::Builder::EvaluateSplits( builder_monitor_.Stop("EvaluateSplits"); } -// split row indexes (rid_span) to 2 parts (left_part, 
right_part) depending -// on comparison of indexes values (idx_span) and split point (split_cond) -// Handle dense columns -// Analog of std::stable_partition, but in no-inplace manner -template -inline std::pair PartitionDenseKernel(const common::DenseColumn& column, - common::Span rid_span, const int32_t split_cond, - common::Span left_part, common::Span right_part) { - const int32_t offset = column.GetBaseIdx(); - const BinIdxType* idx = column.GetFeatureBinIdxPtr().data(); - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; - - if (any_missing) { - for (auto rid : rid_span) { - if (column.IsMissing(rid)) { - if (default_left) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } else { - if ((static_cast(idx[rid]) + offset) <= split_cond) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } - } - } else { - for (auto rid : rid_span) { - if ((static_cast(idx[rid]) + offset) <= split_cond) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } - } - return {nleft_elems, nright_elems}; -} - -// Split row indexes (rid_span) to 2 parts (left_part, right_part) depending -// on comparison of indexes values (idx_span) and split point (split_cond). -// Handle sparse columns -template -inline std::pair PartitionSparseKernel( - const common::SparseColumn& column, - common::Span rid_span, const int32_t split_cond, - common::Span left_part, common::Span right_part) { - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - - size_t nleft_elems = 0; - size_t nright_elems = 0; - const size_t* row_data = column.GetRowData(); - const size_t column_size = column.Size(); - if (rid_span.size()) { // ensure that rid_span is nonempty range - // search first nonzero row with index >= rid_span.front() - const size_t* p = std::lower_bound(row_data, row_data + column_size, - rid_span.front()); - - if (p != row_data + column_size && *p <= rid_span.back()) { - size_t cursor = p - row_data; - - for (auto rid : rid_span) { - while (cursor < column_size - && column.GetRowIdx(cursor) < rid - && column.GetRowIdx(cursor) <= rid_span.back()) { - ++cursor; - } - if (cursor < column_size && column.GetRowIdx(cursor) == rid) { - if (static_cast(column.GetGlobalBinIdx(cursor)) <= split_cond) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - ++cursor; - } else { - // missing value - if (default_left) { - p_left_part[nleft_elems++] = rid; - } else { - p_right_part[nright_elems++] = rid; - } - } - } - } else { // all rows in rid_span have missing values - if (default_left) { - std::copy(rid_span.begin(), rid_span.end(), p_left_part); - nleft_elems = rid_span.size(); - } else { - std::copy(rid_span.begin(), rid_span.end(), p_right_part); - nright_elems = rid_span.size(); - } - } - } - - return {nleft_elems, nright_elems}; -} - -template -template -void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, const common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree) { - const size_t* rid = row_set_collection_[nid].begin; - - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = partition_builder_.GetLeftBuffer(node_in_set, - range.begin(), range.end()); - common::Span right = partition_builder_.GetRightBuffer(node_in_set, - 
range.begin(), range.end()); - const bst_uint fid = tree[nid].SplitIndex(); - const bool default_left = tree[nid].DefaultLeft(); - const auto column_ptr = column_matrix.GetColumn(fid); - - std::pair child_nodes_sizes; - - if (column_ptr->GetType() == xgboost::common::kDenseColumn) { - const common::DenseColumn& column = - static_cast& >(*(column_ptr.get())); - if (default_left) { - if (column_matrix.AnyMissing()) { - child_nodes_sizes = PartitionDenseKernel(column, rid_span, - split_cond, left, right); - } else { - child_nodes_sizes = PartitionDenseKernel(column, rid_span, - split_cond, left, right); - } - } else { - if (column_matrix.AnyMissing()) { - child_nodes_sizes = PartitionDenseKernel(column, rid_span, - split_cond, left, right); - } else { - child_nodes_sizes = PartitionDenseKernel(column, rid_span, - split_cond, left, right); - } - } - } else { - const common::SparseColumn& column - = static_cast& >(*(column_ptr.get())); - if (default_left) { - child_nodes_sizes = PartitionSparseKernel(column, rid_span, - split_cond, left, right); - } else { - child_nodes_sizes = PartitionSparseKernel(column, rid_span, - split_cond, left, right); - } - } - - const size_t n_left = child_nodes_sizes.first; - const size_t n_right = child_nodes_sizes.second; - - partition_builder_.SetNLeftElems(node_in_set, range.begin(), range.end(), n_left); - partition_builder_.SetNRightElems(node_in_set, range.begin(), range.end(), n_right); -} - template void QuantileHistMaker::Builder::FindSplitConditions( const std::vector& nodes, @@ -1070,6 +918,7 @@ void QuantileHistMaker::Builder::AddSplitsToRowSet( } template +template void QuantileHistMaker::Builder::ApplySplit(const std::vector nodes, const GHistIndexMatrix& gmat, const ColumnMatrix& column_matrix, @@ -1102,16 +951,19 @@ void QuantileHistMaker::Builder::ApplySplit(const std::vector(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, *p_tree); + partition_builder_.template Partition(node_in_set, nid, r, + split_conditions[node_in_set], column_matrix, + *p_tree, row_set_collection_[nid].begin); break; case common::kUint16BinsTypeSize: - PartitionKernel(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, *p_tree); + partition_builder_.template Partition(node_in_set, nid, r, + split_conditions[node_in_set], column_matrix, + *p_tree, row_set_collection_[nid].begin); break; case common::kUint32BinsTypeSize: - PartitionKernel(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, *p_tree); + partition_builder_.template Partition(node_in_set, nid, r, + split_conditions[node_in_set], column_matrix, + *p_tree, row_set_collection_[nid].begin); break; default: CHECK(false); // no default behavior @@ -1268,24 +1120,6 @@ GradStats QuantileHistMaker::Builder::EnumerateSplit( template struct QuantileHistMaker::Builder; template struct QuantileHistMaker::Builder; -template void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); -template void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); -template void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); -template void 
QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); -template void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); -template void QuantileHistMaker::Builder::PartitionKernel( - const size_t node_in_set, const size_t nid, common::Range1d range, - const int32_t split_cond, const ColumnMatrix& column_matrix, const RegTree& tree); XGBOOST_REGISTER_TREE_UPDATER(FastHistMaker, "grow_fast_histmaker") .describe("(Deprecated, use grow_quantile_histmaker instead.)" diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index dae0f845ccd3..0ce106222881 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -28,6 +28,7 @@ #include "../common/timer.h" #include "../common/hist_util.h" #include "../common/row_set.h" +#include "../common/partition_builder.h" #include "../common/column_matrix.h" namespace xgboost { @@ -291,14 +292,6 @@ class QuantileHistMaker: public TreeUpdater { DMatrix* p_fmat, RegTree* p_tree); - inline void BuildHist(const std::vector& gpair, - const RowSetCollection::Elem row_indices, - const GHistIndexMatrix& gmat, - GHistRowT hist) { - hist_builder_.BuildHist(gpair, row_indices, gmat, hist, - data_layout_ != DataLayout::kSparseData); - } - inline void SubtractionTrick(GHistRowT self, GHistRowT sibling, GHistRowT parent) { @@ -338,17 +331,13 @@ class QuantileHistMaker: public TreeUpdater { const HistCollection& hist, const RegTree& tree); + template void ApplySplit(std::vector nodes, const GHistIndexMatrix& gmat, const ColumnMatrix& column_matrix, const HistCollection& hist, RegTree* p_tree); - template - void PartitionKernel(const size_t node_in_set, const size_t nid, const common::Range1d range, - const int32_t split_cond, - const ColumnMatrix& column_matrix, const RegTree& tree); - void AddSplitsToRowSet(const std::vector& nodes, RegTree* p_tree); @@ -376,10 +365,11 @@ class QuantileHistMaker: public TreeUpdater { // else - there are missing values bool SplitContainsMissingValues(const GradStats e, const NodeEntry& snode); + template void BuildLocalHistograms(const GHistIndexMatrix &gmat, RegTree *p_tree, const std::vector &gpair_h); - + template void InitRoot(const GHistIndexMatrix &gmat, const DMatrix& fmat, RegTree *p_tree, @@ -402,7 +392,7 @@ class QuantileHistMaker: public TreeUpdater { const DMatrix& fmat, const std::vector &gpair_h, const std::vector& nodes_for_apply_split, RegTree *p_tree); - + template void ExpandTree(const GHistIndexMatrix& gmat, const ColumnMatrix& column_matrix, DMatrix* p_fmat, diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index 75530fc53676..2cb88855420c 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -23,19 +23,19 @@ TEST(DenseColumn, Test) { for (auto j = 0ull; j < dmat->Info().num_col_; j++) { switch (column_matrix.GetTypeSize()) { case kUint8BinsTypeSize: { - auto col = column_matrix.GetColumn(j); + auto col = column_matrix.GetColumn(j); ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], (*col.get()).GetGlobalBinIdx(i)); } break; case kUint16BinsTypeSize: { - auto col = column_matrix.GetColumn(j); + auto col = column_matrix.GetColumn(j); ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], 
(*col.get()).GetGlobalBinIdx(i)); } break; case kUint32BinsTypeSize: { - auto col = column_matrix.GetColumn(j); + auto col = column_matrix.GetColumn(j); ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], (*col.get()).GetGlobalBinIdx(i)); } @@ -68,17 +68,17 @@ TEST(SparseColumn, Test) { column_matrix.Init(gmat, 0.5); switch (column_matrix.GetTypeSize()) { case kUint8BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckSparseColumn(*col.get(), gmat); } break; case kUint16BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckSparseColumn(*col.get(), gmat); } break; case kUint32BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckSparseColumn(*col.get(), gmat); } break; @@ -89,7 +89,7 @@ TEST(SparseColumn, Test) { template inline void CheckColumWithMissingValue(const Column& col_input, const GHistIndexMatrix& gmat) { - const DenseColumn& col = static_cast& >(col_input); + const DenseColumn& col = static_cast& >(col_input); for (auto i = 0ull; i < col.Size(); i++) { if (col.IsMissing(i)) continue; EXPECT_EQ(gmat.index[gmat.row_ptr[i]], @@ -109,17 +109,17 @@ TEST(DenseColumnWithMissing, Test) { column_matrix.Init(gmat, 0.2); switch (column_matrix.GetTypeSize()) { case kUint8BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckColumWithMissingValue(*col.get(), gmat); } break; case kUint16BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckColumWithMissingValue(*col.get(), gmat); } break; case kUint32BinsTypeSize: { - auto col = column_matrix.GetColumn(0); + auto col = column_matrix.GetColumn(0); CheckColumWithMissingValue(*col.get(), gmat); } break; diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc index 45f65ea3ba6b..885b924e71c1 100644 --- a/tests/cpp/common/test_partition_builder.cc +++ b/tests/cpp/common/test_partition_builder.cc @@ -4,6 +4,7 @@ #include #include "../../../src/common/row_set.h" +#include "../../../src/common/partition_builder.h" #include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index 0009cd238424..79ebfc0c1e6a 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -309,7 +309,7 @@ class QuantileHistMock : public QuantileHistMaker { RealImpl::InitData(gmat, fmat, tree, &gpair); this->hist_.AddHistRow(nid); this->hist_.AllocateAllData(); - this->BuildHist(gpair, this->row_set_collection_[nid], + this->hist_builder_.template BuildHist(gpair, this->row_set_collection_[nid], gmat, this->hist_[nid]); // Check if number of histogram bins is correct @@ -350,7 +350,7 @@ class QuantileHistMock : public QuantileHistMaker { RealImpl::InitData(gmat, *dmat, tree, &row_gpairs); this->hist_.AddHistRow(0); this->hist_.AllocateAllData(); - this->BuildHist(row_gpairs, this->row_set_collection_[0], + this->hist_builder_.template BuildHist(row_gpairs, this->row_set_collection_[0], gmat, this->hist_[0]); RealImpl::InitNewNode(0, gmat, row_gpairs, *dmat, tree); @@ -482,8 +482,13 @@ class QuantileHistMock : public QuantileHistMaker { }); const size_t task_id = RealImpl::partition_builder_.GetTaskIdx(0, 0); RealImpl::partition_builder_.AllocateForTask(task_id); - this->template PartitionKernel(0, 0, common::Range1d(0, kNRows), - split, cm, tree); + if (cm.AnyMissing()) { + 
RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows),
+                                  split, cm, tree, this->row_set_collection_[0].begin);
+      } else {
+        RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows),
+                                  split, cm, tree, this->row_set_collection_[0].begin);
+      }
       RealImpl::partition_builder_.CalculateRowOffsets();
       ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), left_cnt);
       ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), right_cnt);
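
A note on the new column accessors in column_matrix.h: both SparseColumn and DenseColumn now expose GetInitialState() and GetBinIdx(), so the unified partition kernel can walk either layout with one loop. For the sparse case, the second argument of GetBinIdx is a forward-only cursor into the column's stored rows, seeded once per row span by GetInitialState. Below is a minimal standalone sketch of that cursor protocol; SparseColumnSketch and its members are illustrative stand-ins, not the real xgboost::common classes, and the sketch assumes row ids are queried in non-decreasing order.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-in for common::SparseColumn; not the real XGBoost class.
struct SparseColumnSketch {
  static constexpr int32_t kMissingId = -1;
  std::vector<std::size_t> row_ind;  // row id of each stored (non-missing) entry
  std::vector<uint32_t> bins;        // global bin index of each stored entry

  // Index of the first stored entry with row id >= first_row_id
  // (equals row_ind.size() when every requested row is missing).
  std::size_t GetInitialState(std::size_t first_row_id) const {
    return std::lower_bound(row_ind.begin(), row_ind.end(), first_row_id) -
           row_ind.begin();
  }

  // Advance the cursor to rid and return its bin, or kMissingId if rid has no
  // stored value. Correct as long as rid never decreases across calls, which
  // holds because the row ids inside a partition are kept sorted.
  int32_t GetBinIdx(std::size_t rid, std::size_t* state) const {
    while (*state < row_ind.size() && row_ind[*state] < rid) ++(*state);
    if (*state < row_ind.size() && row_ind[*state] == rid) {
      return static_cast<int32_t>(bins[*state]);
    }
    return kMissingId;
  }
};

int main() {
  SparseColumnSketch col{{1, 4, 7}, {10, 11, 12}};  // only rows 1, 4, 7 are present
  const std::vector<std::size_t> rows{0, 1, 2, 4, 6, 7};
  std::size_t state = col.GetInitialState(rows.front());
  for (std::size_t rid : rows) {
    std::cout << "row " << rid << " -> bin " << col.GetBinIdx(rid, &state) << "\n";
  }
  return 0;
}

Because the cursor only moves forward, looking up every row of a sorted partition costs a single pass over the column's stored entries; the dense column simply ignores the cursor and checks its missing flag, which is what lets both layouts share the same partition loop.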
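
The former BuildHistDenseKernel and BuildHistSparseKernel in hist_util.cc are now a single BuildHistKernel templated on any_missing: with no missing values, row boundaries are implicit (rid * n_features) and the per-feature compressed bin ids get their feature offset added back; with missing values, the CSR row_ptr supplies the boundaries and bin ids are already global. A rough standalone sketch of that shape follows; the type names are illustrative (not the real GradientPair or GHistIndexMatrix) and the software prefetching is omitted.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative stand-ins for GradientPair and GHistIndexMatrix.
struct GradSketch { float grad, hess; };

struct QuantizedMatrixSketch {
  std::vector<uint32_t> index;       // bin ids; per-feature local ids when dense
  std::vector<std::size_t> row_ptr;  // CSR row offsets, used when any_missing
  std::vector<uint32_t> offsets;     // per-feature bin offset, used when !any_missing
  std::size_t n_features = 0;
};

// One kernel covers both layouts; the dense/sparse distinction is resolved at
// compile time through any_missing, which is the point of the unification.
template <bool any_missing>
void BuildHistSketch(const std::vector<GradSketch>& gpair,
                     const std::vector<std::size_t>& rows,
                     const QuantizedMatrixSketch& gmat,
                     std::vector<double>* hist) {  // hist holds 2 * n_bins values
  for (std::size_t rid : rows) {
    const std::size_t begin = any_missing ? gmat.row_ptr[rid] : rid * gmat.n_features;
    const std::size_t end = any_missing ? gmat.row_ptr[rid + 1] : begin + gmat.n_features;
    for (std::size_t j = begin; j < end; ++j) {
      // Dense storage keeps bin ids local to the feature, so add the feature
      // offset back; the sparse path already stores global bin ids.
      const std::size_t bin = gmat.index[j] + (any_missing ? 0u : gmat.offsets[j - begin]);
      (*hist)[2 * bin] += gpair[rid].grad;      // gradient
      (*hist)[2 * bin + 1] += gpair[rid].hess;  // hessian
    }
  }
}

// Runtime flag -> compile-time flag, mirroring how Update() dispatches to
// ExpandTree<true>/ExpandTree<false> based on ColumnMatrix::AnyMissing().
void BuildHistAnyLayout(bool any_missing, const std::vector<GradSketch>& gpair,
                        const std::vector<std::size_t>& rows,
                        const QuantizedMatrixSketch& gmat, std::vector<double>* hist) {
  if (any_missing) {
    BuildHistSketch<true>(gpair, rows, gmat, hist);
  } else {
    BuildHistSketch<false>(gpair, rows, gmat, hist);
  }
}

int main() {
  // Dense case: 2 rows x 2 features; feature 0 has 2 bins, feature 1 has 3 bins.
  QuantizedMatrixSketch gmat;
  gmat.n_features = 2;
  gmat.index = {0, 1, 1, 2};   // local bin ids, row-major
  gmat.offsets = {0, 2};       // global bin = local bin + offset of its feature
  const std::vector<GradSketch> gpair = {{1.f, 1.f}, {2.f, 1.f}};
  std::vector<double> hist(2 * 5, 0.0);
  BuildHistAnyLayout(/*any_missing=*/false, gpair, {0, 1}, gmat, &hist);
  for (std::size_t b = 0; b < 5; ++b) {
    std::cout << "bin " << b << ": grad=" << hist[2 * b]
              << " hess=" << hist[2 * b + 1] << "\n";
  }
  return 0;
}

BuildHistDispatch in the patch does the same runtime-to-compile-time hand-off one level lower as well, switching on the stored bin width (uint8_t/uint16_t/uint32_t) before calling the templated kernel.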
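
PartitionBuilder::PartitionKernel replaces the old PartitionDenseKernel/PartitionSparseKernel pair: for every row it fetches the split feature's bin id, sends rows with a missing value in the node's default direction, and otherwise compares the bin against split_cond, with default_left and any_missing fixed at compile time. A simplified version of that decision logic, operating on a plain per-row bin array rather than a real Column object (names are illustrative only):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

constexpr int32_t kMissingId = -1;

// Stable-partition row ids into preallocated left/right buffers. bins[rid] is
// the global bin of the split feature for row rid, or kMissingId when missing.
template <bool default_left, bool any_missing>
std::pair<std::size_t, std::size_t> PartitionKernelSketch(
    const std::vector<int32_t>& bins, const std::vector<std::size_t>& rid_span,
    int32_t split_cond, std::vector<std::size_t>* left, std::vector<std::size_t>* right) {
  std::size_t n_left = 0, n_right = 0;
  for (std::size_t rid : rid_span) {
    const int32_t bin = bins[rid];
    if (any_missing && bin == kMissingId) {
      // Missing value: follow the node's default direction.
      if (default_left) {
        (*left)[n_left++] = rid;
      } else {
        (*right)[n_right++] = rid;
      }
    } else if (bin <= split_cond) {
      (*left)[n_left++] = rid;
    } else {
      (*right)[n_right++] = rid;
    }
  }
  return {n_left, n_right};
}

int main() {
  const std::vector<int32_t> bins = {0, kMissingId, 2, 1, kMissingId, 3};
  const std::vector<std::size_t> rows = {0, 1, 2, 3, 4, 5};
  std::vector<std::size_t> left(rows.size()), right(rows.size());
  // Split at bin 1, missing values go left.
  const auto sizes = PartitionKernelSketch<true, true>(bins, rows, /*split_cond=*/1,
                                                       &left, &right);
  std::cout << "left:";
  for (std::size_t i = 0; i < sizes.first; ++i) std::cout << ' ' << left[i];
  std::cout << "\nright:";
  for (std::size_t i = 0; i < sizes.second; ++i) std::cout << ' ' << right[i];
  std::cout << '\n';
  return 0;
}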
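
Finally, the relocated PartitionBuilder keeps one BlockInfo per task so threads can partition disjoint row blocks of the same node independently. CalculateRowOffsets then turns the per-block left/right counts into write offsets, and MergeToArray copies every block's left part ahead of every block's right part, producing the final [left rows | right rows] layout inside the node's row set. A small sketch of that bookkeeping with simplified names (BlockSketch is not the real BlockInfo):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Per-task partial result for one node: which rows went left/right within the
// task's block of rows.
struct BlockSketch {
  std::vector<std::size_t> left, right;
  std::size_t offset_left = 0, offset_right = 0;  // filled by CalculateRowOffsetsSketch
};

// Turn per-block counts into write offsets so that all left parts land before
// all right parts, preserving block (and therefore row) order within each side.
void CalculateRowOffsetsSketch(std::vector<BlockSketch>* blocks) {
  std::size_t n_left = 0;
  for (auto& b : *blocks) { b.offset_left = n_left; n_left += b.left.size(); }
  std::size_t n_right = 0;
  for (auto& b : *blocks) { b.offset_right = n_left + n_right; n_right += b.right.size(); }
}

// Each task copies its partial result into the node's row-index array; with the
// offsets above, the writes of different blocks never overlap.
void MergeToArraySketch(const BlockSketch& b, std::vector<std::size_t>* rows) {
  std::copy(b.left.begin(), b.left.end(), rows->begin() + b.offset_left);
  std::copy(b.right.begin(), b.right.end(), rows->begin() + b.offset_right);
}

int main() {
  // Two tasks partitioned rows {0..3} and {4..7} of one node independently.
  std::vector<BlockSketch> blocks = {
      {{0, 2}, {1, 3}},  // task 0: rows 0,2 went left; rows 1,3 went right
      {{5}, {4, 6, 7}},  // task 1: row 5 went left; rows 4,6,7 went right
  };
  CalculateRowOffsetsSketch(&blocks);
  std::vector<std::size_t> rows(8);
  for (const auto& b : blocks) MergeToArraySketch(b, &rows);
  for (std::size_t r : rows) std::cout << r << ' ';  // prints: 0 2 5 1 3 4 6 7
  std::cout << '\n';
  return 0;
}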