Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Small cleanups to various data types. #8086

Merged
merged 1 commit into from Jul 18, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
8 changes: 4 additions & 4 deletions include/xgboost/data.h
Expand Up @@ -216,7 +216,7 @@ struct BatchParam {
/*! \brief The GPU device to use. */
int gpu_id {-1};
/*! \brief Maximum number of bins per feature for histograms. */
int max_bin{0};
bst_bin_t max_bin{0};
/*! \brief Hessian, used for sketching with future approx implementation. */
common::Span<float> hess;
/*! \brief Whether DMatrix should regenerate the batch. Only used for GHistIndex. */
Expand All @@ -226,17 +226,17 @@ struct BatchParam {

BatchParam() = default;
// GPU Hist
BatchParam(int32_t device, int32_t max_bin)
BatchParam(int32_t device, bst_bin_t max_bin)
: gpu_id{device}, max_bin{max_bin} {}
// Hist
BatchParam(int32_t max_bin, double sparse_thresh)
BatchParam(bst_bin_t max_bin, double sparse_thresh)
: max_bin{max_bin}, sparse_thresh{sparse_thresh} {}
// Approx
/**
* \brief Get batch with sketch weighted by hessian. The batch will be regenerated if
* the span is changed, so caller should keep the span for each iteration.
*/
BatchParam(int32_t max_bin, common::Span<float> hessian, bool regenerate)
BatchParam(bst_bin_t max_bin, common::Span<float> hessian, bool regenerate)
: max_bin{max_bin}, hess{hessian}, regen{regenerate} {}

bool operator!=(BatchParam const& other) const {
Expand Down
6 changes: 3 additions & 3 deletions src/common/hist_util.cc
Expand Up @@ -49,14 +49,14 @@ HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, int32_t n_threads, b
}

if (!use_sorted) {
HostSketchContainer container(max_bins, m->Info(), reduced, HostSketchContainer::UseGroup(info),
n_threads);
HostSketchContainer container(max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info), n_threads);
for (auto const& page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
container.MakeCuts(&out);
} else {
SortedSketchContainer container{max_bins, m->Info(), reduced,
SortedSketchContainer container{max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info), n_threads};
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
container.PushColPage(page, info, hessian);
Expand Down
19 changes: 9 additions & 10 deletions src/common/quantile.cc
Expand Up @@ -86,22 +86,22 @@ void SketchContainerImpl<WQSketch>::PushRowPage(SparsePage const &page, MetaInfo

template <typename Batch>
void HostSketchContainer::PushAdapterBatch(Batch const &batch, size_t base_rowid,
MetaInfo const &info, size_t nnz, float missing) {
MetaInfo const &info, float missing) {
auto const &h_weights =
(use_group_ind_ ? detail::UnrollGroupWeights(info) : info.weights_.HostVector());

auto is_valid = data::IsValidFunctor{missing};
auto weights = OptionalWeights{Span<float const>{h_weights}};
// the nnz from info is not reliable as sketching might be the first place to go through
// the data.
auto is_dense = nnz == info.num_col_ * info.num_row_;
this->PushRowPageImpl(batch, base_rowid, weights, nnz, info.num_col_, is_dense, is_valid);
auto is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_;
this->PushRowPageImpl(batch, base_rowid, weights, info.num_nonzero_, info.num_col_, is_dense,
is_valid);
}

#define INSTANTIATE(_type) \
template void HostSketchContainer::PushAdapterBatch<data::_type>( \
data::_type const &batch, size_t base_rowid, MetaInfo const &info, size_t nnz, \
float missing);
#define INSTANTIATE(_type) \
template void HostSketchContainer::PushAdapterBatch<data::_type>( \
data::_type const &batch, size_t base_rowid, MetaInfo const &info, float missing);

INSTANTIATE(ArrayAdapterBatch)
INSTANTIATE(CSRArrayAdapterBatch)
Expand Down Expand Up @@ -436,11 +436,10 @@ void SketchContainerImpl<WQSketch>::MakeCuts(HistogramCuts* cuts) {
template class SketchContainerImpl<WQuantileSketch<float, float>>;
template class SketchContainerImpl<WXQuantileSketch<float, float>>;

HostSketchContainer::HostSketchContainer(int32_t max_bins, MetaInfo const &info,
HostSketchContainer::HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
std::vector<size_t> columns_size, bool use_group,
int32_t n_threads)
: SketchContainerImpl{columns_size, max_bins, info.feature_types.ConstHostSpan(), use_group,
n_threads} {
: SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
monitor_.Init(__func__);
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
Expand Down
14 changes: 6 additions & 8 deletions src/common/quantile.h
Expand Up @@ -903,12 +903,11 @@ class HostSketchContainer : public SketchContainerImpl<WQuantileSketch<float, fl
using WQSketch = WQuantileSketch<float, float>;

public:
HostSketchContainer(int32_t max_bins, MetaInfo const &info, std::vector<size_t> columns_size,
bool use_group, int32_t n_threads);
HostSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
std::vector<size_t> columns_size, bool use_group, int32_t n_threads);

template <typename Batch>
void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, size_t nnz,
float missing);
void PushAdapterBatch(Batch const &batch, size_t base_rowid, MetaInfo const &info, float missing);
};

/**
Expand Down Expand Up @@ -1000,13 +999,12 @@ class SortedSketchContainer : public SketchContainerImpl<WXQuantileSketch<float,
using Super = SketchContainerImpl<WXQuantileSketch<float, float>>;

public:
explicit SortedSketchContainer(int32_t max_bins, MetaInfo const &info,
explicit SortedSketchContainer(int32_t max_bins, common::Span<FeatureType const> ft,
std::vector<size_t> columns_size, bool use_group,
int32_t n_threads)
: SketchContainerImpl{columns_size, max_bins, info.feature_types.ConstHostSpan(), use_group,
n_threads} {
: SketchContainerImpl{columns_size, max_bins, ft, use_group, n_threads} {
monitor_.Init(__func__);
sketches_.resize(info.num_col_);
sketches_.resize(columns_size.size());
size_t i = 0;
for (auto &sketch : sketches_) {
sketch.sketch = &Super::sketches_[i];
Expand Down
11 changes: 5 additions & 6 deletions src/data/adapter.h
Expand Up @@ -1137,16 +1137,15 @@ class SparsePageAdapterBatch {

public:
struct Line {
SparsePage::Inst inst;
Entry const* inst;
size_t n;
bst_row_t ridx;
COOTuple GetElement(size_t idx) const {
return COOTuple{ridx, inst.data()[idx].index, inst.data()[idx].fvalue};
}
size_t Size() const { return inst.size(); }
COOTuple GetElement(size_t idx) const { return {ridx, inst[idx].index, inst[idx].fvalue}; }
size_t Size() const { return n; }
};

explicit SparsePageAdapterBatch(HostSparsePageView page) : page_{std::move(page)} {}
Line GetLine(size_t ridx) const { return Line{page_[ridx], ridx}; }
Line GetLine(size_t ridx) const { return Line{page_[ridx].data(), page_[ridx].size(), ridx}; }
size_t Size() const { return page_.Size(); }
};
}; // namespace data
Expand Down
15 changes: 9 additions & 6 deletions src/data/device_adapter.cuh
Expand Up @@ -92,9 +92,8 @@ class CudfAdapterBatch : public detail::NoMetaInfo {
*/
class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
public:
explicit CudfAdapter(std::string cuda_interfaces_str) {
Json interfaces =
Json::Load({cuda_interfaces_str.c_str(), cuda_interfaces_str.size()});
explicit CudfAdapter(StringView cuda_interfaces_str) {
Json interfaces = Json::Load(cuda_interfaces_str);
std::vector<Json> const& json_columns = get<Array>(interfaces);
size_t n_columns = json_columns.size();
CHECK_GT(n_columns, 0) << "Number of columns must not equal to 0.";
Expand Down Expand Up @@ -123,6 +122,9 @@ class CudfAdapter : public detail::SingleBatchDataIter<CudfAdapterBatch> {
columns_ = columns;
batch_ = CudfAdapterBatch(dh::ToSpan(columns_), num_rows_);
}
explicit CudfAdapter(std::string cuda_interfaces_str)
: CudfAdapter{StringView{cuda_interfaces_str}} {}

const CudfAdapterBatch& Value() const override {
CHECK_EQ(batch_.columns_.data(), columns_.data().get());
return batch_;
Expand Down Expand Up @@ -163,9 +165,8 @@ class CupyAdapterBatch : public detail::NoMetaInfo {

class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
public:
explicit CupyAdapter(std::string cuda_interface_str) {
Json json_array_interface =
Json::Load({cuda_interface_str.c_str(), cuda_interface_str.size()});
explicit CupyAdapter(StringView cuda_interface_str) {
Json json_array_interface = Json::Load(cuda_interface_str);
array_interface_ = ArrayInterface<2>(get<Object const>(json_array_interface));
batch_ = CupyAdapterBatch(array_interface_);
if (array_interface_.Shape(0) == 0) {
Expand All @@ -174,6 +175,8 @@ class CupyAdapter : public detail::SingleBatchDataIter<CupyAdapterBatch> {
device_idx_ = dh::CudaGetPointerDevice(array_interface_.data);
CHECK_NE(device_idx_, -1);
}
explicit CupyAdapter(std::string cuda_interface_str)
: CupyAdapter{StringView{cuda_interface_str}} {}
const CupyAdapterBatch& Value() const override { return batch_; }

size_t NumRows() const { return array_interface_.Shape(0); }
Expand Down
8 changes: 4 additions & 4 deletions src/data/proxy_dmatrix.cu
Expand Up @@ -7,8 +7,8 @@
namespace xgboost {
namespace data {

void DMatrixProxy::FromCudaColumnar(std::string interface_str) {
std::shared_ptr<data::CudfAdapter> adapter {new data::CudfAdapter(interface_str)};
void DMatrixProxy::FromCudaColumnar(StringView interface_str) {
std::shared_ptr<data::CudfAdapter> adapter{new CudfAdapter{interface_str}};
auto const& value = adapter->Value();
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
Expand All @@ -19,8 +19,8 @@ void DMatrixProxy::FromCudaColumnar(std::string interface_str) {
}
}

void DMatrixProxy::FromCudaArray(std::string interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter(interface_str));
void DMatrixProxy::FromCudaArray(StringView interface_str) {
std::shared_ptr<CupyAdapter> adapter(new CupyAdapter{StringView{interface_str}});
this->batch_ = adapter;
ctx_.gpu_id = adapter->DeviceIdx();
this->Info().num_col_ = adapter->NumColumns();
Expand Down
16 changes: 8 additions & 8 deletions src/data/proxy_dmatrix.h
Expand Up @@ -48,8 +48,8 @@ class DMatrixProxy : public DMatrix {
Context ctx_;

#if defined(XGBOOST_USE_CUDA)
void FromCudaColumnar(std::string interface_str);
void FromCudaArray(std::string interface_str);
void FromCudaColumnar(StringView interface_str);
void FromCudaArray(StringView interface_str);
#endif // defined(XGBOOST_USE_CUDA)

public:
Expand All @@ -58,9 +58,8 @@ class DMatrixProxy : public DMatrix {
void SetCUDAArray(char const* c_interface) {
common::AssertGPUSupport();
#if defined(XGBOOST_USE_CUDA)
std::string interface_str = c_interface;
Json json_array_interface =
Json::Load({interface_str.c_str(), interface_str.size()});
StringView interface_str{c_interface};
Json json_array_interface = Json::Load(interface_str);
if (IsA<Array>(json_array_interface)) {
this->FromCudaColumnar(interface_str);
} else {
Expand Down Expand Up @@ -114,10 +113,11 @@ class DMatrixProxy : public DMatrix {
}
};

inline DMatrixProxy *MakeProxy(DMatrixHandle proxy) {
auto proxy_handle = static_cast<std::shared_ptr<DMatrix> *>(proxy);
inline DMatrixProxy* MakeProxy(DMatrixHandle proxy) {
auto proxy_handle = static_cast<std::shared_ptr<DMatrix>*>(proxy);
CHECK(proxy_handle) << "Invalid proxy handle.";
DMatrixProxy *typed = static_cast<DMatrixProxy *>(proxy_handle->get());
DMatrixProxy* typed = static_cast<DMatrixProxy*>(proxy_handle->get());
CHECK(typed) << "Invalid proxy handle.";
return typed;
}

Expand Down
8 changes: 4 additions & 4 deletions tests/cpp/common/test_quantile.cc
Expand Up @@ -82,8 +82,8 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
std::vector<float> hessian(rows, 1.0);
auto hess = Span<float const>{hessian};

ContainerType<use_column> sketch_distributed(n_bins, m->Info(), column_size, false,
OmpGetNumThreads(0));
ContainerType<use_column> sketch_distributed(n_bins, m->Info().feature_types.ConstHostSpan(),
column_size, false, OmpGetNumThreads(0));

if (use_column) {
for (auto const& page : m->GetBatches<SortedCSCPage>()) {
Expand All @@ -103,8 +103,8 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
CHECK_EQ(rabit::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
m->Info().num_row_ = world * rows;
ContainerType<use_column> sketch_on_single_node(n_bins, m->Info(), column_size, false,
OmpGetNumThreads(0));
ContainerType<use_column> sketch_on_single_node(n_bins, m->Info().feature_types.ConstHostSpan(),
column_size, false, OmpGetNumThreads(0));
m->Info().num_row_ = rows;

for (auto rank = 0; rank < world; ++rank) {
Expand Down