Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Batch UpdatePosition using cudaMemcpy #7964

Merged
merged 72 commits into from Jun 30, 2022
Merged
Show file tree
Hide file tree
Changes from 71 commits
Commits
Show all changes
72 commits
Select commit Hold shift + click to select a range
2b4cf67
Remove single_precision_histogram
RAMitchell Apr 21, 2022
f140ebc
Batch nodes from driver
RAMitchell Apr 25, 2022
80a3e78
Categoricals broken
RAMitchell Apr 29, 2022
e1fb702
Refactor categoricals
RAMitchell May 1, 2022
dc100cf
Refactor categoricals 2
RAMitchell May 2, 2022
bc74458
Skip copy if no categoricals
RAMitchell May 2, 2022
c4f8eac
Review comment
RAMitchell May 5, 2022
2a53849
Merge branch 'master' of github.com:dmlc/xgboost into categorical
RAMitchell May 5, 2022
a1cddaa
Revert "Categoricals broken"
RAMitchell May 5, 2022
829bda6
Merge branch 'master' of github.com:dmlc/xgboost into fuse
RAMitchell May 5, 2022
0bc8745
Merge branch 'categorical' of github.com:RAMitchell/xgboost into fuse
RAMitchell May 5, 2022
fd0e25e
Lint
RAMitchell May 5, 2022
9fab64e
Merge branch 'master' of github.com:dmlc/xgboost into fuse
RAMitchell May 5, 2022
56785f3
Revert "Revert "Categoricals broken""
RAMitchell May 6, 2022
1dd1a6c
Limit concurrent nodes
RAMitchell May 10, 2022
8751d14
Lint
RAMitchell May 11, 2022
49809bf
Basic blockwise partitioning
RAMitchell May 11, 2022
181d7cf
Working block partition
RAMitchell May 12, 2022
666eb9b
Reduction
RAMitchell May 12, 2022
66173c7
Some failing tests
RAMitchell May 13, 2022
ec7fea8
Handle empty candidate
RAMitchell May 13, 2022
49c5f90
Cleanup
RAMitchell May 13, 2022
bd48082
experiments
RAMitchell May 14, 2022
c3ef1f6
Improvements
RAMitchell May 14, 2022
ba8bbdf
Fused scan
RAMitchell May 14, 2022
f4ef4ca
Register blocking
RAMitchell May 15, 2022
9c27dd0
Cleanup
RAMitchell May 17, 2022
0bcc84a
Working tests
RAMitchell May 18, 2022
723ff47
Transplanted new code
RAMitchell May 18, 2022
199bed9
Optimised
RAMitchell May 19, 2022
0e35e99
Do not initialise data structures to maximum possible tree size.
RAMitchell May 19, 2022
daa9b56
Comments, cleanup
RAMitchell May 19, 2022
8ab989e
Refactor FinalizePosition
RAMitchell May 20, 2022
d50ec4b
Remove redundant functions
RAMitchell May 20, 2022
c34c3ad
Lint
RAMitchell May 20, 2022
e534edc
Merge branch 'master' of github.com:dmlc/xgboost into batch-position-…
RAMitchell May 20, 2022
47bfc6e
Remove old kernel
RAMitchell May 20, 2022
a53ba87
Add tests for AtomicIncrement
RAMitchell May 20, 2022
7450d68
Change lambda to kernel
RAMitchell May 23, 2022
6df1259
Smem + lineinfo
RAMitchell May 24, 2022
4010942
Use stream
RAMitchell May 24, 2022
1b13fe6
Fast global stores
RAMitchell May 25, 2022
24fb339
Fast load without shmem
RAMitchell May 27, 2022
f40fe94
Memcpy version
RAMitchell May 30, 2022
7d5d7e7
Remove left counts kernel
RAMitchell May 30, 2022
77f8550
Unstable partition
RAMitchell May 31, 2022
14d8663
Warp aggregates
RAMitchell May 31, 2022
ec968f7
Cleanup
RAMitchell Jun 1, 2022
a764986
Use pointer for shared memory
RAMitchell Jun 1, 2022
001c2f2
Row partitioner grid
RAMitchell Jun 2, 2022
70bad86
Custom FinalizePositionKernel
RAMitchell Jun 2, 2022
31e02f0
Revert "Custom FinalizePositionKernel"
RAMitchell Jun 2, 2022
b86cb29
Reduce grid size
RAMitchell Jun 2, 2022
c3944af
Tune items/thread
RAMitchell Jun 4, 2022
cdd134a
FinalisePosition custom kernel
RAMitchell Jun 6, 2022
edabc45
Fixing slow scatter
RAMitchell Jun 7, 2022
43eb83e
Remove unstable
RAMitchell Jun 13, 2022
d87e366
Merge branch 'master' of github.com:dmlc/xgboost into batch-position-…
RAMitchell Jun 13, 2022
968bb29
Format
RAMitchell Jun 14, 2022
1372ad8
Review comments
RAMitchell Jun 15, 2022
a910fb9
Reintroduce prediction caching for external memory.
RAMitchell Jun 17, 2022
ff05df5
Avoid initialising temp memory
RAMitchell Jun 19, 2022
c3a0e32
Merge branch 'master' of github.com:dmlc/xgboost into batch-position-…
RAMitchell Jun 23, 2022
0280b8c
Lint
RAMitchell Jun 23, 2022
9c642dc
Review comments.
RAMitchell Jun 24, 2022
b4f2128
Remove external memory prediction caching.
RAMitchell Jun 27, 2022
8caed98
Merge branch 'master' of github.com:dmlc/xgboost into batch-position-…
RAMitchell Jun 28, 2022
776ef9f
Remove constant memory in favour of __ldg().
RAMitchell Jun 28, 2022
33fea3d
Clang tidy
RAMitchell Jun 28, 2022
9de0692
Clang tidy
RAMitchell Jun 28, 2022
3cd5e41
Review comments.
RAMitchell Jun 29, 2022
9eddfce
Initialise memory in case zero training rows.
RAMitchell Jun 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
21 changes: 21 additions & 0 deletions src/common/device_helpers.cuh
Expand Up @@ -1939,4 +1939,25 @@ class CUDAStream {
CUDAStreamView View() const { return CUDAStreamView{stream_}; }
void Sync() { this->View().Sync(); }
};

// Force nvcc to load data as constant.
// Wrapping a pointer in this iterator routes every load through __ldg(),
// which hints that the pointed-to memory is read-only for the duration of
// the kernel and may be served from the read-only data cache.
template <typename T>
class LDGIterator {
  using DeviceWordT = typename cub::UnitWord<T>::DeviceWord;
  static constexpr std::size_t kNumWords = sizeof(T) / sizeof(DeviceWordT);

  const T *ptr_;

 public:
  explicit LDGIterator(const T *ptr) : ptr_(ptr) {}
  // Load element `idx` one device word at a time via __ldg(). T must be an
  // exact multiple of DeviceWordT in size (enforced by the static_assert).
  __device__ T operator[](std::size_t idx) const {
    DeviceWordT tmp[kNumWords];
    static_assert(sizeof(tmp) == sizeof(T), "Expect sizes to be equal.");
#pragma unroll
    // std::size_t index avoids a signed/unsigned comparison against kNumWords.
    for (std::size_t i = 0; i < kNumWords; i++) {
      tmp[i] = __ldg(reinterpret_cast<const DeviceWordT *>(ptr_ + idx) + i);
    }
    return *reinterpret_cast<const T *>(tmp);
  }
};
} // namespace dh
159 changes: 17 additions & 142 deletions src/tree/gpu_hist/row_partitioner.cu
@@ -1,174 +1,49 @@
/*!
* Copyright 2017-2021 XGBoost contributors
* Copyright 2017-2022 XGBoost contributors
*/
#include <thrust/iterator/discard_iterator.h>
#include <thrust/iterator/transform_output_iterator.h>
#include <thrust/sequence.h>

#include <vector>

#include "../../common/device_helpers.cuh"
#include "row_partitioner.cuh"

namespace xgboost {
namespace tree {
// Value type threaded through the partition scan: `idx` is the row's
// original position, `flag` accumulates how many rows so far belong to
// the left node.
struct IndexFlagTuple {
  size_t idx;
  size_t flag;
};

// Scan operator: keeps the right-hand element's index (the current scan
// position) while summing the flags, yielding an inclusive prefix count of
// left-node rows.
struct IndexFlagOp {
  __device__ IndexFlagTuple operator()(const IndexFlagTuple& a,
                                       const IndexFlagTuple& b) const {
    return {b.idx, a.flag + b.flag};
  }
};

// Output functor for the partition scan: scatters each row's node id and row
// index to its final partitioned slot as a side effect, then discards the
// scan's own result.
struct WriteResultsFunctor {
  bst_node_t left_nidx;
  common::Span<bst_node_t> position_in;
  common::Span<bst_node_t> position_out;
  common::Span<RowPartitioner::RowIndexT> ridx_in;
  common::Span<RowPartitioner::RowIndexT> ridx_out;
  // Total number of rows assigned to the left node, computed on device.
  int64_t* d_left_count;

  __device__ IndexFlagTuple operator()(const IndexFlagTuple& x) {
    // the ex_scan_result represents how many rows have been assigned to left
    // node so far during scan.
    // NOTE(review): `int` may narrow for very large row counts; the scan uses
    // size_t indices — confirm the supported row limit.
    int scatter_address;
    if (position_in[x.idx] == left_nidx) {
      scatter_address = x.flag - 1;  // -1 because inclusive scan
    } else {
      // current number of rows belong to right node + total number of rows
      // belong to left node
      scatter_address = (x.idx - x.flag) + *d_left_count;
    }
    // copy the node id to output
    position_out[scatter_address] = position_in[x.idx];
    ridx_out[scatter_address] = ridx_in[x.idx];

    // Discard — all useful work happens via the writes above.
    return {};
  }
};

// Implement partitioning via single scan operation using transform output to
// write the result. Rows equal to left_nidx are packed to the front of the
// segment; all others follow, preserving relative order within each side.
void RowPartitioner::SortPosition(common::Span<bst_node_t> position,
                                  common::Span<bst_node_t> position_out,
                                  common::Span<RowIndexT> ridx,
                                  common::Span<RowIndexT> ridx_out,
                                  bst_node_t left_nidx, bst_node_t,
                                  int64_t* d_left_count, cudaStream_t stream) {
  WriteResultsFunctor write_results{left_nidx, position, position_out,
                                    ridx, ridx_out, d_left_count};
  // Writing through this iterator invokes write_results for its side effects
  // and throws away the scan's own output values.
  auto discard_write_iterator =
      thrust::make_transform_output_iterator(dh::TypedDiscard<IndexFlagTuple>(), write_results);
  auto counting = thrust::make_counting_iterator(0llu);
  // Each scan input is (row position, 1 if the row belongs to the left node).
  auto input_iterator = dh::MakeTransformIterator<IndexFlagTuple>(
      counting, [=] __device__(size_t idx) {
        return IndexFlagTuple{idx, static_cast<size_t>(position[idx] == left_nidx)};
      });
  // First InclusiveScan call (null temp storage) only queries the number of
  // temporary bytes required; the second call does the actual scan.
  size_t temp_bytes = 0;
  cub::DeviceScan::InclusiveScan(nullptr, temp_bytes, input_iterator,
                                 discard_write_iterator, IndexFlagOp(),
                                 position.size(), stream);
  dh::TemporaryArray<int8_t> temp(temp_bytes);
  cub::DeviceScan::InclusiveScan(temp.data().get(), temp_bytes, input_iterator,
                                 discard_write_iterator, IndexFlagOp(),
                                 position.size(), stream);
}

// Initialise partitioner state on the given device: row indices become the
// identity mapping and every row starts in node 0 (the root).
void Reset(int device_idx, common::Span<RowPartitioner::RowIndexT> ridx,
           common::Span<bst_node_t> position) {
  dh::safe_cuda(cudaSetDevice(device_idx));
  CHECK_EQ(ridx.size(), position.size());
  dh::LaunchN(ridx.size(), [=] __device__(size_t idx) {
    ridx[idx] = idx;
    position[idx] = 0;
  });
}

RowPartitioner::RowPartitioner(int device_idx, size_t num_rows)
: device_idx_(device_idx), ridx_a_(num_rows), position_a_(num_rows),
ridx_b_(num_rows), position_b_(num_rows) {
: device_idx_(device_idx),
ridx_(num_rows),
ridx_tmp_(num_rows),
d_counts_(kMaxUpdatePositionBatchSize) {
dh::safe_cuda(cudaSetDevice(device_idx_));
ridx_ = dh::DoubleBuffer<RowIndexT>{&ridx_a_, &ridx_b_};
position_ = dh::DoubleBuffer<bst_node_t>{&position_a_, &position_b_};
ridx_segments_.emplace_back(static_cast<size_t>(0), num_rows);

Reset(device_idx, ridx_.CurrentSpan(), position_.CurrentSpan());
left_counts_.resize(256);
thrust::fill(left_counts_.begin(), left_counts_.end(), 0);
streams_.resize(2);
for (auto& stream : streams_) {
dh::safe_cuda(cudaStreamCreate(&stream));
}
ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)});
thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size());
dh::safe_cuda(cudaStreamCreate(&stream_));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we use dh::CUDAStream instead?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see this class is using non-blocking streams with respect to the default stream. Directly swapping it results in a crash - there is the assumption in many places that kernels running on the default stream wait for previous kernels to finish.

RAMitchell marked this conversation as resolved.
Show resolved Hide resolved
}

RowPartitioner::~RowPartitioner() {
dh::safe_cuda(cudaSetDevice(device_idx_));
for (auto& stream : streams_) {
dh::safe_cuda(cudaStreamDestroy(stream));
}
dh::safe_cuda(cudaStreamDestroy(stream_));
}

common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(
bst_node_t nidx) {
auto segment = ridx_segments_.at(nidx);
// Return empty span here as a valid result
// Will error if we try to construct a span from a pointer with size 0
if (segment.Size() == 0) {
return {};
}
return ridx_.CurrentSpan().subspan(segment.begin, segment.Size());
common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows(bst_node_t nidx) {
auto segment = ridx_segments_.at(nidx).segment;
return dh::ToSpan(ridx_).subspan(segment.begin, segment.Size());
}

common::Span<const RowPartitioner::RowIndexT> RowPartitioner::GetRows() {
return ridx_.CurrentSpan();
return dh::ToSpan(ridx_);
}

// Returns the current node position of every row (device memory span).
common::Span<const bst_node_t> RowPartitioner::GetPosition() {
  return position_.CurrentSpan();
}
std::vector<RowPartitioner::RowIndexT> RowPartitioner::GetRowsHost(
bst_node_t nidx) {
std::vector<RowPartitioner::RowIndexT> RowPartitioner::GetRowsHost(bst_node_t nidx) {
auto span = GetRows(nidx);
std::vector<RowIndexT> rows(span.size());
dh::CopyDeviceSpanToVector(&rows, span);
return rows;
}

std::vector<bst_node_t> RowPartitioner::GetPositionHost() {
auto span = GetPosition();
std::vector<bst_node_t> position(span.size());
dh::CopyDeviceSpanToVector(&position, span);
return position;
}

// Partitions one node's segment into left/right children via SortPosition,
// writing into the double buffer's "other" side, then copies the partitioned
// data back into the "current" side so both buffers stay consistent.
void RowPartitioner::SortPositionAndCopy(const Segment& segment,
                                         bst_node_t left_nidx,
                                         bst_node_t right_nidx,
                                         int64_t* d_left_count,
                                         cudaStream_t stream) {
  SortPosition(
      // position_in
      common::Span<bst_node_t>(position_.Current() + segment.begin,
                               segment.Size()),
      // position_out
      common::Span<bst_node_t>(position_.Other() + segment.begin,
                               segment.Size()),
      // row index in
      common::Span<RowIndexT>(ridx_.Current() + segment.begin, segment.Size()),
      // row index out
      common::Span<RowIndexT>(ridx_.Other() + segment.begin, segment.Size()),
      left_nidx, right_nidx, d_left_count, stream);
  // Copy back key/value
  const auto d_position_current = position_.Current() + segment.begin;
  const auto d_position_other = position_.Other() + segment.begin;
  const auto d_ridx_current = ridx_.Current() + segment.begin;
  const auto d_ridx_other = ridx_.Other() + segment.begin;
  dh::LaunchN(segment.Size(), stream, [=] __device__(size_t idx) {
    d_position_current[idx] = d_position_other[idx];
    d_ridx_current[idx] = d_ridx_other[idx];
  });
}
}; // namespace tree
}; // namespace xgboost