Loop over reduce. #6229

Merged
merged 1 commit on Oct 13, 2020
2 changes: 1 addition & 1 deletion doc/tutorials/saving_model.rst
@@ -167,7 +167,7 @@ or in R:
Will print out something similar to (not actual output, as it is too long for demonstration):

-.. code-block:: json
+.. code-block:: javascript
{
"Learner": {
2 changes: 2 additions & 0 deletions python-package/xgboost/core.py
@@ -871,6 +871,8 @@ class DeviceQuantileDMatrix(DMatrix):
.. versionadded:: 1.1.0
+Known limitation:
+The data size (rows * cols) cannot exceed 2 ** 31 - 1000
"""

def __init__(self, data, label=None, weight=None, # pylint: disable=W0231
4 changes: 4 additions & 0 deletions python-package/xgboost/dask.py
@@ -509,6 +509,10 @@ class DaskDeviceQuantileDMatrix(DaskDMatrix):
max_bin: Number of bins for histogram construction.


+Known issue:
+The size of each chunk (rows * cols for a single dask chunk/partition)
+cannot exceed 2 ** 31 - 1000

'''
def __init__(self, client,
data,
17 changes: 17 additions & 0 deletions src/common/device_helpers.cuh
@@ -1107,4 +1107,21 @@ size_t SegmentedUnique(Inputs &&...inputs) {
dh::XGBCachingDeviceAllocator<char> alloc;
return SegmentedUnique(thrust::cuda::par(alloc), std::forward<Inputs&&>(inputs)...);
}

+// Batched replacement for a single thrust::reduce call: thrust reductions
+// use 32-bit indexing internally, so very large inputs are reduced in
+// chunks of at most INT32_MAX / 2 elements and the partial results are
+// folded together on the host.
+template <typename Policy, typename InputIt, typename Init, typename Func>
+auto Reduce(Policy policy, InputIt first, InputIt second, Init init, Func reduce_op) {
+  size_t constexpr kLimit = std::numeric_limits<int32_t>::max() / 2;
+  size_t size = std::distance(first, second);
+  using Ty = std::remove_cv_t<Init>;
+  Ty aggregate = init;  // host-side accumulator
+  for (size_t offset = 0; offset < size; offset += kLimit) {
+    auto begin_it = first + offset;
+    auto end_it = first + std::min(offset + kLimit, size);
+    size_t batch_size = std::distance(begin_it, end_it);
+    CHECK_LE(batch_size, size);
+    // init is folded in once per batch, so it should be the identity of reduce_op.
+    auto ret = thrust::reduce(policy, begin_it, end_it, init, reduce_op);
+    aggregate = reduce_op(aggregate, ret);  // combine on the host
+  }
+  return aggregate;
+}
} // namespace dh
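For orientation, here is a minimal usage sketch of the new helper (illustrative only, not part of the PR; it assumes device_helpers.cuh is on the include path):

// Sum a device vector whose length may exceed what a single
// thrust::reduce call can safely index with 32-bit integers.
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>

#include "device_helpers.cuh"

size_t SumLarge(thrust::device_vector<size_t> const &data) {
  dh::XGBCachingDeviceAllocator<char> alloc;
  return dh::Reduce(thrust::cuda::par(alloc), data.begin(), data.end(),
                    size_t(0), thrust::plus<size_t>{});
}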
2 changes: 1 addition & 1 deletion src/data/device_adapter.cuh
@@ -221,7 +221,7 @@ size_t GetRowCounts(const AdapterBatchT batch, common::Span<size_t> offset,
}
});
dh::XGBCachingDeviceAllocator<char> alloc;
-size_t row_stride = thrust::reduce(
+size_t row_stride = dh::Reduce(
thrust::cuda::par(alloc), thrust::device_pointer_cast(offset.data()),
thrust::device_pointer_cast(offset.data()) + offset.size(), size_t(0),
thrust::maximum<size_t>());
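(A note inferred from the diff, not PR text: row_stride is a max-reduction over the per-row entry counts in offset, so switching to dh::Reduce keeps inputs longer than the 32-bit cap safe without changing the result.)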
8 changes: 8 additions & 0 deletions src/data/ellpack_page.cu
@@ -206,6 +206,14 @@ void CopyDataToEllpack(const AdapterBatchT& batch, EllpackPageImpl* dst,
WriteCompressedEllpackFunctor<AdapterBatchT>, decltype(discard)>
out(discard, functor);
dh::XGBCachingDeviceAllocator<char> alloc;
+// 1000 is a safety margin for inclusive_scan; without it the scan might
+// overflow internally and lead to an OOM error, or fail with:
+//   after reduction step 2: cudaErrorInvalidConfiguration: invalid configuration argument
+// https://github.com/NVIDIA/thrust/issues/1299
+CHECK_LE(batch.Size(), std::numeric_limits<int32_t>::max() - 1000)
+    << "Known limitation, size (rows * cols) of quantile based DMatrix "
+       "cannot exceed the limit of 32-bit integer.";
thrust::inclusive_scan(thrust::cuda::par(alloc), key_value_index_iter,
key_value_index_iter + batch.Size(), out,
[=] __device__(Tuple a, Tuple b) {
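(For scale: 2^31 - 1 is roughly 2.1 billion, so a dense batch of about 46,341 x 46,341 rows x columns already exceeds the cap; the extra 1000 is the safety margin described in the comment above.)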
4 changes: 2 additions & 2 deletions src/tree/gpu_hist/histogram.cu
@@ -53,7 +53,7 @@ struct Pair {
GradientPair first;
GradientPair second;
};
-XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
+__host__ XGBOOST_DEV_INLINE Pair operator+(Pair const& lhs, Pair const& rhs) {
return {lhs.first + rhs.first, lhs.second + rhs.second};
}
} // anonymous namespace
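(Reasoning inferred from the diff: dh::Reduce combines the per-batch results on the host, via reduce_op(aggregate, ret), so the operator+ handed to it must be callable from host code as well; hence the added __host__ qualifier alongside the existing device-inline one.)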
@@ -86,7 +86,7 @@ GradientSumT CreateRoundingFactor(common::Span<GradientPair const> gpair) {
thrust::device_ptr<GradientPair const> gpair_end {gpair.data() + gpair.size()};
auto beg = thrust::make_transform_iterator(gpair_beg, Clip());
auto end = thrust::make_transform_iterator(gpair_end, Clip());
-Pair p = thrust::reduce(thrust::cuda::par(alloc), beg, end, Pair{});
+Pair p = dh::Reduce(thrust::cuda::par(alloc), beg, end, Pair{}, thrust::plus<Pair>{});
GradientPair positive_sum {p.first}, negative_sum {p.second};

auto histogram_rounding = GradientSumT {
5 changes: 3 additions & 2 deletions src/tree/updater_gpu_hist.cu
@@ -642,10 +642,11 @@ struct GPUHistMakerDevice {
ExpandEntry InitRoot(RegTree* p_tree, dh::AllReducer* reducer) {
constexpr bst_node_t kRootNIdx = 0;
dh::XGBCachingDeviceAllocator<char> alloc;
-GradientPair root_sum = thrust::reduce(
+GradientPair root_sum = dh::Reduce(
thrust::cuda::par(alloc),
thrust::device_ptr<GradientPair const>(gpair.data()),
-thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()));
+thrust::device_ptr<GradientPair const>(gpair.data() + gpair.size()),
+GradientPair{}, thrust::plus<GradientPair>{});
rabit::Allreduce<rabit::op::Sum, float>(reinterpret_cast<float*>(&root_sum),
2);
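(The old three-argument thrust::reduce defaulted to a value-initialized accumulator and operator+; dh::Reduce takes both explicitly, which is why GradientPair{} and thrust::plus<GradientPair>{} are now spelled out.)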

11 changes: 9 additions & 2 deletions tests/cpp/common/test_device_helpers.cu
@@ -1,4 +1,3 @@

/*!
* Copyright 2017 XGBoost contributors
*/
@@ -122,6 +121,14 @@ void TestSegmentedUniqueRegression(std::vector<SketchEntry> values, size_t n_dup
ASSERT_EQ(segments.at(1), d_segments_out[1] + n_duplicated);
}

+TEST(DeviceHelpers, Reduce) {
+  size_t kSize = std::numeric_limits<uint32_t>::max();
+  auto it = thrust::make_counting_iterator(0ul);
+  dh::XGBCachingDeviceAllocator<char> alloc;
+  auto batched = dh::Reduce(thrust::cuda::par(alloc), it, it + kSize, 0ul,
+                            thrust::maximum<size_t>{});
+  CHECK_EQ(batched, kSize - 1);
+}
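(This test reduces 2^32 - 1 counting-iterator elements with a max-reduction: kSize is roughly four times kLimit = INT32_MAX / 2, so the helper's loop must run five batches, and the expected maximum of a counting iterator starting at 0 is kSize - 1.)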


TEST(SegmentedUnique, Regression) {
{
@@ -157,4 +164,4 @@ TEST(SegmentedUnique, Regression) {
}
}
} // namespace common
-} // namespace xgboost
\ No newline at end of file
+} // namespace xgboost
1 change: 0 additions & 1 deletion tests/cpp/data/test_ellpack_page.cu
@@ -234,5 +234,4 @@ TEST(EllpackPage, Compact) {
}
}
}

} // namespace xgboost