From 3b23d4d6894138d9e4eee7dec11c290acd8d7d0b Mon Sep 17 00:00:00 2001
From: fis
Date: Tue, 12 Jul 2022 04:04:53 +0800
Subject: [PATCH 1/4] Split up column matrix initialization.

This PR splits the column matrix initialization into two steps: the first
initializes the storage, while the second performs the transpose. By doing
so, we can reuse the code for the Quantile DMatrix.
---
 amalgamation/xgboost-all0.cc |   3 +
 src/common/column_matrix.cc  |  65 ++++++++++++++++++++
 src/common/column_matrix.h   | 112 +++++++++++------------------------
 src/data/gradient_index.h    |   5 +-
 4 files changed, 105 insertions(+), 80 deletions(-)
 create mode 100644 src/common/column_matrix.cc

diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index cf02b07a097e..2cbde50a0f41 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -69,7 +69,10 @@
 #include "../src/learner.cc"
 #include "../src/logging.cc"
 #include "../src/global_config.cc"
+
+// common
 #include "../src/common/common.cc"
+#include "../src/common/column_matrix.cc"
 #include "../src/common/random.cc"
 #include "../src/common/charconv.cc"
 #include "../src/common/timer.cc"
diff --git a/src/common/column_matrix.cc b/src/common/column_matrix.cc
new file mode 100644
index 000000000000..91977b96dcdf
--- /dev/null
+++ b/src/common/column_matrix.cc
@@ -0,0 +1,65 @@
+/*!
+ * Copyright 2017-2022 by XGBoost Contributors
+ * \brief Utility for fast column-wise access
+ */
+#include "column_matrix.h"
+
+namespace xgboost {
+namespace common {
+void ColumnMatrix::InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold) {
+  auto const nfeature = gmat.Features();
+  const size_t nrow = gmat.Size();
+  // identify type of each column
+  type_.resize(nfeature);
+
+  uint32_t max_val = std::numeric_limits<uint32_t>::max();
+  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
+    CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
+  }
+
+  bool all_dense_column = true;
+
+  std::vector<size_t> feature_counts(nfeature, 0);
+  gmat.GetFeatureCounts(feature_counts.data());
+
+  // classify features
+  for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
+    if (static_cast<double>(feature_counts[fid]) < sparse_threshold * nrow) {
+      type_[fid] = kSparseColumn;
+      all_dense_column = false;
+    } else {
+      type_[fid] = kDenseColumn;
+    }
+  }
+
+  // want to compute storage boundary for each feature
+  // using variants of prefix sum scan
+  feature_offsets_.resize(nfeature + 1);
+  size_t accum_index = 0;
+  feature_offsets_[0] = accum_index;
+  for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
+    if (type_[fid - 1] == kDenseColumn) {
+      accum_index += static_cast<size_t>(nrow);
+    } else {
+      accum_index += feature_counts[fid - 1];
+    }
+    feature_offsets_[fid] = accum_index;
+  }
+
+  SetTypeSize(gmat.max_num_bins);
+  auto storage_size =
+      feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
+  index_.resize(storage_size, 0);
+  if (!all_dense_column) {
+    row_ind_.resize(feature_offsets_[nfeature]);
+  }
+
+  // store least bin id for each feature
+  index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
+
+  any_missing_ = !gmat.IsDense();
+
+  missing_flags_.clear();
+}
+}  // namespace common
+}  // namespace xgboost
diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 45614e6e29ae..055c077700af 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -133,77 +133,34 @@ class DenseColumnIter : public Column<BinIdxT> {
  * column.
  */
 class ColumnMatrix {
+  void InitStorage(GHistIndexMatrix const& gmat, double sparse_threshold);
+
  public:
   // get number of features
   bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }
 
-  template <typename Batch>
-  void Init(Batch const& batch, float missing, GHistIndexMatrix const& gmat,
-            double sparse_threshold, int32_t n_threads) {
-    auto const nfeature = static_cast<bst_feature_t>(gmat.cut.Ptrs().size() - 1);
-    const size_t nrow = gmat.row_ptr.size() - 1;
-    // identify type of each column
-    feature_counts_.resize(nfeature);
-    type_.resize(nfeature);
-    std::fill(feature_counts_.begin(), feature_counts_.end(), 0);
-    uint32_t max_val = std::numeric_limits<uint32_t>::max();
-    for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
-      CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
-    }
-
-    bool all_dense_column = true;
-    gmat.GetFeatureCounts(&feature_counts_[0]);
-    // classify features
-    for (bst_feature_t fid = 0; fid < nfeature; ++fid) {
-      if (static_cast<double>(feature_counts_[fid]) < sparse_threshold * nrow) {
-        type_[fid] = kSparseColumn;
-        all_dense_column = false;
-      } else {
-        type_[fid] = kDenseColumn;
-      }
-    }
-
-    // want to compute storage boundary for each feature
-    // using variants of prefix sum scan
-    feature_offsets_.resize(nfeature + 1);
-    size_t accum_index = 0;
-    feature_offsets_[0] = accum_index;
-    for (bst_feature_t fid = 1; fid < nfeature + 1; ++fid) {
-      if (type_[fid - 1] == kDenseColumn) {
-        accum_index += static_cast<size_t>(nrow);
-      } else {
-        accum_index += feature_counts_[fid - 1];
-      }
-      feature_offsets_[fid] = accum_index;
-    }
-
-    SetTypeSize(gmat.max_num_bins);
-    auto storage_size =
-        feature_offsets_.back() * static_cast<std::underlying_type_t<BinTypeSize>>(bins_type_size_);
-    index_.resize(storage_size, 0);
-    if (!all_dense_column) {
-      row_ind_.resize(feature_offsets_[nfeature]);
-    }
-
-    // store least bin id for each feature
-    index_base_ = const_cast<uint32_t*>(gmat.cut.Ptrs().data());
-
-    any_missing_ = !gmat.IsDense();
-
-    missing_flags_.clear();
+  ColumnMatrix() = default;
+  ColumnMatrix(GHistIndexMatrix const& gmat, double sparse_threshold) {
+    this->InitStorage(gmat, sparse_threshold);
+  }
+  template <typename Batch>
+  void PushBatch(int32_t n_threads, Batch const& batch, float missing, GHistIndexMatrix const& gmat,
+                 size_t base_rowid) {
     // pre-fill index_ for dense columns
-    BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
+    auto n_features = gmat.Features();
+    auto batch_size = batch.Size();
     if (!any_missing_) {
-      missing_flags_.resize(feature_offsets_[nfeature], false);
+      missing_flags_.resize(feature_offsets_[n_features], false);
       // row index is compressed, we need to dispatch it.
-      DispatchBinType(gmat_bin_size, [&, nrow, nfeature, n_threads](auto t) {
+      DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
         using RowBinIdxT = decltype(t);
-        SetIndexNoMissing(gmat.index.data<RowBinIdxT>(), nrow, nfeature, n_threads);
+        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), batch_size, n_features,
+                          n_threads);
       });
     } else {
-      missing_flags_.resize(feature_offsets_[nfeature], true);
-      SetIndexMixedColumns(batch, gmat.index.data<uint32_t>(), gmat, nfeature, missing);
+      missing_flags_.resize(feature_offsets_[n_features], true);
+      SetIndexMixedColumns(base_rowid, batch, gmat, n_features, missing);
     }
   }
 
@@ -211,7 +168,9 @@ class ColumnMatrix {
   void Init(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
             int32_t n_threads) {
     auto batch = data::SparsePageAdapterBatch{page.GetView()};
-    this->Init(batch, std::numeric_limits<float>::quiet_NaN(), gmat, sparse_threshold, n_threads);
+    this->InitStorage(gmat, sparse_threshold);
+    // ignore the base row id here as we always have one column matrix for each sparse page.
+    this->PushBatch(n_threads, batch, std::numeric_limits<float>::quiet_NaN(), gmat, 0);
   }
 
   /* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
@@ -250,17 +209,17 @@ class ColumnMatrix {
   // all columns are dense column and has no missing value
   // FIXME(jiamingy): We don't need a column matrix if there's no missing value.
   template <typename RowBinIdxT>
-  void SetIndexNoMissing(RowBinIdxT const* row_index, const size_t n_samples,
+  void SetIndexNoMissing(bst_row_t base_rowid, RowBinIdxT const* row_index, const size_t n_samples,
                          const size_t n_features, int32_t n_threads) {
     DispatchBinType(bins_type_size_, [&](auto t) {
       using ColumnBinT = decltype(t);
       auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
                                            index_.size() / sizeof(ColumnBinT)};
       ParallelFor(n_samples, n_threads, [&](auto rid) {
+        rid += base_rowid;
         const size_t ibegin = rid * n_features;
         const size_t iend = (rid + 1) * n_features;
-        size_t j = 0;
-        for (size_t i = ibegin; i < iend; ++i, ++j) {
+        for (size_t i = ibegin, j = 0; i < iend; ++i, ++j) {
           const size_t idx = feature_offsets_[j];
           // No need to add offset, as row index is compressed and stores the local index
           column_index[idx + rid] = row_index[i];
         }
       });
     });
   }
 
@@ -273,16 +232,15 @@ class ColumnMatrix {
    * \brief Set column index for both dense and sparse columns
    */
   template <typename Batch>
-  void SetIndexMixedColumns(Batch const& batch, uint32_t const* row_index,
-                            const GHistIndexMatrix& gmat, size_t n_features, float missing) {
-    std::vector<size_t> num_nonzeros;
-    num_nonzeros.resize(n_features, 0);
+  void SetIndexMixedColumns(size_t base_rowid, Batch const& batch, const GHistIndexMatrix& gmat,
+                            size_t n_features, float missing) {
+    auto const* row_index = gmat.index.data<uint32_t>() + gmat.row_ptr[base_rowid];
     auto is_valid = data::IsValidFunctor {missing};
     DispatchBinType(bins_type_size_, [&](auto t) {
       using ColumnBinT = decltype(t);
       ColumnBinT* local_index = reinterpret_cast<ColumnBinT*>(index_.data());
-
+      num_nonzeros_.resize(n_features, 0);
       auto get_bin_idx = [&](auto bin_id, auto rid, bst_feature_t fid) {
         if (type_[fid] == kDenseColumn) {
           ColumnBinT* begin = &local_index[feature_offsets_[fid]];
           begin[rid] = bin_id - index_base_[fid];
           missing_flags_[feature_offsets_[fid] + rid] = false;
         } else {
           ColumnBinT* begin = &local_index[feature_offsets_[fid]];
-          begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
-          row_ind_[feature_offsets_[fid] + num_nonzeros[fid]] = rid;
-          ++num_nonzeros[fid];
+          begin[num_nonzeros_[fid]] = bin_id - index_base_[fid];
+          row_ind_[feature_offsets_[fid] + num_nonzeros_[fid]] = rid;
+          ++num_nonzeros_[fid];
         }
       };
 
-      const size_t batch_size = gmat.Size();
+      size_t const batch_size = batch.Size();
       size_t k{0};
       for (size_t rid = 0; rid < batch_size; ++rid) {
         auto line = batch.GetLine(rid);
@@ -307,7 +265,7 @@ class ColumnMatrix {
           if (is_valid(coo)) {
             auto fid = coo.column_idx;
             const uint32_t bin_id = row_index[k];
-            get_bin_idx(bin_id, rid, fid);
+            get_bin_idx(bin_id, rid + base_rowid, fid);
             ++k;
           }
         }
@@ -324,7 +282,6 @@ class ColumnMatrix {
   // IO procedures for external memory.
   bool Read(dmlc::SeekStream* fi, uint32_t const* index_base) {
     fi->Read(&index_);
-    fi->Read(&feature_counts_);
 #if !DMLC_LITTLE_ENDIAN
     // s390x
     std::vector<std::underlying_type<ColumnType>::type> int_types;
@@ -361,7 +318,6 @@ class ColumnMatrix {
            sizeof(uint64_t);
     };
     write_vec(index_);
-    write_vec(feature_counts_);
 #if !DMLC_LITTLE_ENDIAN
     // s390x
     std::vector<std::underlying_type<ColumnType>::type> int_types(type_.size());
@@ -391,11 +347,13 @@ class ColumnMatrix {
  private:
   std::vector<uint8_t> index_;
 
-  std::vector<size_t> feature_counts_;
   std::vector<ColumnType> type_;
+  /* row indices of a CSC matrix. */
   std::vector<size_t> row_ind_;
   /* indicate where each column's index and row_ind is stored. */
   std::vector<size_t> feature_offsets_;
+  /* The number of nnz of each column. */
+  std::vector<size_t> num_nonzeros_;
 
   // index_base_[fid]: least bin id for feature fid
   uint32_t const* index_base_;
diff --git a/src/data/gradient_index.h b/src/data/gradient_index.h
index 8096b9c98e16..d7091757df52 100644
--- a/src/data/gradient_index.h
+++ b/src/data/gradient_index.h
@@ -109,9 +109,8 @@ class GHistIndexMatrix {
    */
   size_t RowIdx(size_t ridx) const { return row_ptr[ridx - base_rowid]; }
 
-  bst_row_t Size() const {
-    return row_ptr.empty() ? 0 : row_ptr.size() - 1;
-  }
+  bst_row_t Size() const { return row_ptr.empty() ? 0 : row_ptr.size() - 1; }
+  bst_feature_t Features() const { return cut.Ptrs().size() - 1; }
 
   bool ReadColumnPage(dmlc::SeekStream* fi);
   size_t WriteColumnPage(dmlc::Stream* fo) const;

From b5a85756bdc4a9f222d9d81c4c16f31755e2f55d Mon Sep 17 00:00:00 2001
From: fis
Date: Wed, 13 Jul 2022 10:13:00 +0800
Subject: [PATCH 2/4] msvc.

---
 src/common/column_matrix.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 055c077700af..9bed60c8e430 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -149,13 +149,12 @@ class ColumnMatrix {
                  size_t base_rowid) {
     // pre-fill index_ for dense columns
     auto n_features = gmat.Features();
-    auto batch_size = batch.Size();
     if (!any_missing_) {
       missing_flags_.resize(feature_offsets_[n_features], false);
       // row index is compressed, we need to dispatch it.
       DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
         using RowBinIdxT = decltype(t);
-        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), batch_size, n_features,
+        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), batch.Size(), n_features,
                           n_threads);
       });
     } else {

From bc408775a7942dbf14cf479f1309235f829f1062 Mon Sep 17 00:00:00 2001
From: fis
Date: Wed, 13 Jul 2022 10:45:49 +0800
Subject: [PATCH 3/4] MSVC.

---
 src/common/column_matrix.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 9bed60c8e430..2a8fc2d69ca2 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -152,10 +152,9 @@ class ColumnMatrix {
     if (!any_missing_) {
       missing_flags_.resize(feature_offsets_[n_features], false);
       // row index is compressed, we need to dispatch it.
-      DispatchBinType(gmat.index.GetBinTypeSize(), [&](auto t) {
+      DispatchBinType(gmat.index.GetBinTypeSize(), [&, size = batch.Size()](auto t) {
         using RowBinIdxT = decltype(t);
-        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), batch.Size(), n_features,
-                          n_threads);
+        SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features, n_threads);
       });
     } else {
       missing_flags_.resize(feature_offsets_[n_features], true);

From fe471d10c46e305853d2fee2b7dc069d21bb4c45 Mon Sep 17 00:00:00 2001
From: fis
Date: Wed, 13 Jul 2022 11:18:25 +0800
Subject: [PATCH 4/4] MSVC.

---
 src/common/column_matrix.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/common/column_matrix.h b/src/common/column_matrix.h
index 2a8fc2d69ca2..f16d1252f9a1 100644
--- a/src/common/column_matrix.h
+++ b/src/common/column_matrix.h
@@ -152,7 +152,8 @@ class ColumnMatrix {
     if (!any_missing_) {
       missing_flags_.resize(feature_offsets_[n_features], false);
       // row index is compressed, we need to dispatch it.
-      DispatchBinType(gmat.index.GetBinTypeSize(), [&, size = batch.Size()](auto t) {
+      DispatchBinType(gmat.index.GetBinTypeSize(), [&, size = batch.Size(), n_features = n_features,
+                                                    n_threads = n_threads](auto t) {
         using RowBinIdxT = decltype(t);
         SetIndexNoMissing(base_rowid, gmat.index.data<RowBinIdxT>(), size, n_features, n_threads);
       });
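
Usage sketch (not part of the patches): after this split, InitStorage() sizes
and classifies the per-feature storage once from the full GHistIndexMatrix,
and PushBatch() transposes each batch of rows into that storage at its
base_rowid offset. Below is a minimal sketch of how a caller, such as the
Quantile DMatrix builder this refactor targets, might drive the two steps.
The `pages` batch source, the helper name BuildColumns, and the 0.2 threshold
are illustrative assumptions; the ColumnMatrix constructor, PushBatch(),
SparsePageAdapterBatch, and SparsePage::base_rowid come from the patches and
the surrounding code base.

    #include <cstdint>
    #include <limits>
    #include <vector>

    #include "column_matrix.h"  // include path as seen from src/common

    // Sketch: build one ColumnMatrix from several row batches, assuming
    // `gmat` was accumulated over all rows of `pages` beforehand.
    void BuildColumns(xgboost::GHistIndexMatrix const& gmat,
                      std::vector<xgboost::SparsePage> const& pages,
                      std::int32_t n_threads) {
      // Step 1: InitStorage() runs in the constructor, once for all batches.
      xgboost::common::ColumnMatrix columns{gmat, /*sparse_threshold=*/0.2};
      for (auto const& page : pages) {
        auto batch = xgboost::data::SparsePageAdapterBatch{page.GetView()};
        // Step 2: transpose this batch into the shared storage at its row offset.
        columns.PushBatch(n_threads, batch, std::numeric_limits<float>::quiet_NaN(), gmat,
                          page.base_rowid);
      }
    }

Keeping the storage allocation out of PushBatch() is what makes the loop
above safe: each call only writes the rows of its own batch, so a streaming
consumer never reallocates or re-classifies columns between batches.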