Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use adapter to initialize column matrix. #7912

Merged
merged 3 commits into from May 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
58 changes: 33 additions & 25 deletions src/common/column_matrix.h
Expand Up @@ -16,6 +16,7 @@
#include <utility> // std::move
#include <vector>

#include "../data/adapter.h"
#include "../data/gradient_index.h"
#include "hist_util.h"

Expand Down Expand Up @@ -128,17 +129,17 @@ class DenseColumnIter : public Column<BinIdxT> {
/**
* \brief Column major matrix for gradient index. This matrix contains both dense column
* and sparse column, the type of the column is controlled by sparse threshold. When the
* number of missing values in a column is below the threshold it classified as dense
* number of missing values in a column is below the threshold it's classified as dense
* column.
*/
class ColumnMatrix {
public:
// get number of features
bst_feature_t GetNumFeature() const { return static_cast<bst_feature_t>(type_.size()); }

// construct column matrix from GHistIndexMatrix
void Init(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
int32_t n_threads) {
template <typename Batch>
void Init(Batch const& batch, float missing, GHistIndexMatrix const& gmat,
double sparse_threshold, int32_t n_threads) {
auto const nfeature = static_cast<bst_feature_t>(gmat.cut.Ptrs().size() - 1);
const size_t nrow = gmat.row_ptr.size() - 1;
// identify type of each column
Expand Down Expand Up @@ -190,21 +191,29 @@ class ColumnMatrix {
any_missing_ = !gmat.IsDense();

missing_flags_.clear();

// pre-fill index_ for dense columns
BinTypeSize gmat_bin_size = gmat.index.GetBinTypeSize();
if (!any_missing_) {
missing_flags_.resize(feature_offsets_[nfeature], false);
// row index is compressed, we need to dispatch it.
DispatchBinType(gmat_bin_size, [&, nrow, nfeature, n_threads](auto t) {
using RowBinIdxT = decltype(t);
SetIndexNoMissing(page, gmat.index.data<RowBinIdxT>(), nrow, nfeature, n_threads);
SetIndexNoMissing(gmat.index.data<RowBinIdxT>(), nrow, nfeature, n_threads);
});
} else {
missing_flags_.resize(feature_offsets_[nfeature], true);
SetIndexMixedColumns(page, gmat.index.data<uint32_t>(), gmat, nfeature);
SetIndexMixedColumns(batch, gmat.index.data<uint32_t>(), gmat, nfeature, missing);
}
}

// construct column matrix from GHistIndexMatrix
void Init(SparsePage const& page, const GHistIndexMatrix& gmat, double sparse_threshold,
int32_t n_threads) {
auto batch = data::SparsePageAdapterBatch{page.GetView()};
this->Init(batch, std::numeric_limits<float>::quiet_NaN(), gmat, sparse_threshold, n_threads);
}

/* Set the number of bytes based on numeric limit of maximum number of bins provided by user */
void SetTypeSize(size_t max_bin_per_feat) {
if ((max_bin_per_feat - 1) <= static_cast<int>(std::numeric_limits<uint8_t>::max())) {
Expand Down Expand Up @@ -241,8 +250,8 @@ class ColumnMatrix {
// all columns are dense column and has no missing value
// FIXME(jiamingy): We don't need a column matrix if there's no missing value.
template <typename RowBinIdxT>
void SetIndexNoMissing(SparsePage const& page, RowBinIdxT const* row_index,
const size_t n_samples, const size_t n_features, int32_t n_threads) {
void SetIndexNoMissing(RowBinIdxT const* row_index, const size_t n_samples,
const size_t n_features, int32_t n_threads) {
DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
auto column_index = Span<ColumnBinT>{reinterpret_cast<ColumnBinT*>(index_.data()),
Expand All @@ -263,10 +272,12 @@ class ColumnMatrix {
/**
* \brief Set column index for both dense and sparse columns
*/
void SetIndexMixedColumns(SparsePage const& page, uint32_t const* row_index,
const GHistIndexMatrix& gmat, size_t n_features) {
template <typename Batch>
void SetIndexMixedColumns(Batch const& batch, uint32_t const* row_index,
const GHistIndexMatrix& gmat, size_t n_features, float missing) {
std::vector<size_t> num_nonzeros;
num_nonzeros.resize(n_features, 0);
auto is_valid = data::IsValidFunctor {missing};

DispatchBinType(bins_type_size_, [&](auto t) {
using ColumnBinT = decltype(t);
Expand All @@ -276,7 +287,8 @@ class ColumnMatrix {
if (type_[fid] == kDenseColumn) {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
begin[rid] = bin_id - index_base_[fid];
// not thread-safe with bool vector.
// not thread-safe with bool vector. FIXME(jiamingy): We can directly assign
// kMissingId to the index to avoid missing flags.
missing_flags_[feature_offsets_[fid] + rid] = false;
} else {
ColumnBinT* begin = &local_index[feature_offsets_[fid]];
Expand All @@ -286,22 +298,18 @@ class ColumnMatrix {
}
};

const xgboost::Entry* data_ptr = page.data.HostVector().data();
const std::vector<bst_row_t>& offset_vec = page.offset.HostVector();
const size_t batch_size = gmat.Size();
CHECK_LT(batch_size, offset_vec.size());
size_t k{0};
for (size_t rid = 0; rid < batch_size; ++rid) {
const size_t ibegin = gmat.row_ptr[rid];
const size_t iend = gmat.row_ptr[rid + 1];
const size_t size = offset_vec[rid + 1] - offset_vec[rid];
SparsePage::Inst inst = {data_ptr + offset_vec[rid], size};

CHECK_EQ(ibegin + inst.size(), iend);
size_t j = 0;
for (size_t i = ibegin; i < iend; ++i, ++j) {
const uint32_t bin_id = row_index[i];
auto fid = inst[j].index;
get_bin_idx(bin_id, rid, fid);
auto line = batch.GetLine(rid);
for (size_t i = 0; i < line.Size(); ++i) {
auto coo = line.GetElement(i);
if (is_valid(coo)) {
auto fid = coo.column_idx;
const uint32_t bin_id = row_index[k];
get_bin_idx(bin_id, rid, fid);
++k;
}
}
}
});
Expand Down
2 changes: 2 additions & 0 deletions src/common/hist_util.h
Expand Up @@ -211,6 +211,8 @@ auto DispatchBinType(BinTypeSize type, Fn&& fn) {
return fn(uint32_t{});
}
}
LOG(FATAL) << "Unreachable";
return fn(uint32_t{});
}

/**
Expand Down
18 changes: 18 additions & 0 deletions src/data/adapter.h
Expand Up @@ -1131,6 +1131,24 @@ class RecordBatchesIterAdapter: public dmlc::DataIter<ArrowColumnarBatchVec> {
struct ArrowSchemaImporter schema_;
ArrowColumnarBatchVec batches_;
};

/**
 * \brief Adapter exposing a SparsePage through the adapter-batch interface
 *        (GetLine/Size), so code written against adapter batches (e.g.
 *        ColumnMatrix::Init) can also consume an in-core SparsePage.
 */
class SparsePageAdapterBatch {
  HostSparsePageView page_;

 public:
  // A single row of the page, viewed as COO elements.
  struct Line {
    SparsePage::Inst inst;  // the row's entries (feature index + value)
    bst_row_t ridx;         // row index this line corresponds to
    // Return the idx-th entry of this row as a (row, column, value) tuple.
    COOTuple GetElement(size_t idx) const {
      return COOTuple{ridx, inst.data()[idx].index, inst.data()[idx].fvalue};
    }
    // Number of stored (non-zero) entries in this row.
    size_t Size() const { return inst.size(); }
  };

  explicit SparsePageAdapterBatch(HostSparsePageView page) : page_{std::move(page)} {}
  // View row `ridx` of the underlying page as a Line.
  Line GetLine(size_t ridx) const { return Line{page_[ridx], ridx}; }
  // Number of rows in the underlying page.
  size_t Size() const { return page_.Size(); }
};
}; // namespace data
} // namespace xgboost
#endif // XGBOOST_DATA_ADAPTER_H_
77 changes: 29 additions & 48 deletions tests/cpp/common/test_column_matrix.cc
Expand Up @@ -31,34 +31,33 @@ TEST(DenseColumn, Test) {
ASSERT_FALSE(column_matrix.AnyMissing());
for (auto i = 0ull; i < dmat->Info().num_row_; i++) {
for (auto j = 0ull; j < dmat->Info().num_col_; j++) {
switch (column_matrix.GetTypeSize()) {
case kUint8BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint8_t, false>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], col.GetGlobalBinIdx(i));
} break;
case kUint16BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint16_t, false>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], col.GetGlobalBinIdx(i));
} break;
case kUint32BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint32_t, false>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], col.GetGlobalBinIdx(i));
} break;
}
DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
using T = decltype(dtype);
auto col = column_matrix.DenseColumn<T, false>(j);
ASSERT_EQ(gmat.index[i * dmat->Info().num_col_ + j], col.GetGlobalBinIdx(i));
});
}
}
}
}

// Validate a sparse column iterator against the gradient index it was built
// from: every stored entry must match the first bin of its owning row, and
// any row flagged as missing must be empty in the gradient index.
template <typename BinIdxType>
void CheckSparseColumn(SparseColumnIter<BinIdxType>* p_col, const GHistIndexMatrix& gmat) {
  auto& col = *p_col;

  // Single-column matrix: the column holds exactly as many entries as gmat.
  ASSERT_EQ(col.Size(), gmat.index.Size());
  for (size_t k = 0; k < col.Size(); ++k) {
    auto ridx = col.GetRowIdx(k);
    ASSERT_EQ(gmat.index[gmat.row_ptr[ridx]], col.GetGlobalBinIdx(k));
  }

  size_t const n_samples = gmat.row_ptr.size() - 1;
  for (size_t ridx = 0; ridx < n_samples; ++ridx) {
    if (col[ridx] == Column<BinIdxType>::kMissingId) {
      auto beg = gmat.row_ptr[ridx];
      auto end = gmat.row_ptr[ridx + 1];
      ASSERT_EQ(end - beg, 0);
    }
  }
}

TEST(SparseColumn, Test) {
Expand All @@ -72,26 +71,17 @@ TEST(SparseColumn, Test) {
for (auto const& page : dmat->GetBatches<SparsePage>()) {
column_matrix.Init(page, gmat, 1.0, common::OmpGetNumThreads(0));
}
switch (column_matrix.GetTypeSize()) {
case kUint8BinsTypeSize: {
auto col = column_matrix.SparseColumn<uint8_t>(0, 0);
CheckSparseColumn(col, gmat);
} break;
case kUint16BinsTypeSize: {
auto col = column_matrix.SparseColumn<uint16_t>(0, 0);
CheckSparseColumn(col, gmat);
} break;
case kUint32BinsTypeSize: {
auto col = column_matrix.SparseColumn<uint32_t>(0, 0);
CheckSparseColumn(col, gmat);
} break;
}
common::DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
using T = decltype(dtype);
auto col = column_matrix.SparseColumn<T>(0, 0);
CheckSparseColumn(&col, gmat);
});
}
}

template <typename BinIdxType>
inline void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
const GHistIndexMatrix& gmat) {
void CheckColumWithMissingValue(const DenseColumnIter<BinIdxType, true>& col,
const GHistIndexMatrix& gmat) {
for (auto i = 0ull; i < col.Size(); i++) {
if (col.IsMissing(i)) continue;
EXPECT_EQ(gmat.index[gmat.row_ptr[i]], col.GetGlobalBinIdx(i));
Expand All @@ -110,20 +100,11 @@ TEST(DenseColumnWithMissing, Test) {
column_matrix.Init(page, gmat, 0.2, common::OmpGetNumThreads(0));
}
ASSERT_TRUE(column_matrix.AnyMissing());
switch (column_matrix.GetTypeSize()) {
case kUint8BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint8_t, true>(0);
CheckColumWithMissingValue(col, gmat);
} break;
case kUint16BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint16_t, true>(0);
CheckColumWithMissingValue(col, gmat);
} break;
case kUint32BinsTypeSize: {
auto col = column_matrix.DenseColumn<uint32_t, true>(0);
CheckColumWithMissingValue(col, gmat);
} break;
}
DispatchBinType(column_matrix.GetTypeSize(), [&](auto dtype) {
using T = decltype(dtype);
auto col = column_matrix.DenseColumn<T, true>(0);
CheckColumWithMissingValue(col, gmat);
});
}
}

Expand Down