From 11d65fcb21b5742b53b7d032f1e4b3a2ba01544c Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 13 May 2022 14:30:35 +0800 Subject: [PATCH] Extract partial sum into an independent function. (#7889) --- src/common/common.h | 18 ------ src/common/numeric.h | 96 ++++++++++++++++++++++++++++++++ src/data/data.cc | 1 + src/data/gradient_index.cc | 56 ++----------------- src/objective/adaptive.cc | 1 + tests/cpp/common/test_numeric.cc | 33 +++++++++++ tests/cpp/tree/test_approx.cc | 1 + 7 files changed, 138 insertions(+), 68 deletions(-) create mode 100644 src/common/numeric.h create mode 100644 tests/cpp/common/test_numeric.cc diff --git a/src/common/common.h b/src/common/common.h index 4949d61e4582..1f36f3d8d687 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -274,24 +274,6 @@ template XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) { return indptr[group + 1] - 1; } - -/** - * \brief Run length encode on CPU, input must be sorted. - */ -template -void RunLengthEncode(Iter begin, Iter end, std::vector *p_out) { - auto &out = *p_out; - out = std::vector{0}; - size_t n = std::distance(begin, end); - for (size_t i = 1; i < n; ++i) { - if (begin[i] != begin[i - 1]) { - out.push_back(i); - } - } - if (out.back() != n) { - out.push_back(n); - } -} } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_COMMON_H_ diff --git a/src/common/numeric.h b/src/common/numeric.h new file mode 100644 index 000000000000..ff5ac2242033 --- /dev/null +++ b/src/common/numeric.h @@ -0,0 +1,96 @@ +/*! + * Copyright 2022, XGBoost contributors. + */ +#ifndef XGBOOST_COMMON_NUMERIC_H_ +#define XGBOOST_COMMON_NUMERIC_H_ + +#include // std::max +#include // std::iterator_traits +#include + +#include "threading_utils.h" +#include "xgboost/generic_parameters.h" + +namespace xgboost { +namespace common { + +/** + * \brief Run length encode on CPU, input must be sorted. 
+ */ +template +void RunLengthEncode(Iter begin, Iter end, std::vector *p_out) { + auto &out = *p_out; + out = std::vector{0}; + size_t n = std::distance(begin, end); + for (size_t i = 1; i < n; ++i) { + if (begin[i] != begin[i - 1]) { + out.push_back(i); + } + } + if (out.back() != n) { + out.push_back(n); + } +} + +/** + * \brief Variant of std::partial_sum, out_it should point to a container that has n + 1 + * elements. Useful for constructing a CSR indptr. + */ +template +void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) { + static_assert(std::is_same::value_type>::value, ""); + static_assert(std::is_same::value_type>::value, ""); + // The number of threads is pegged to the batch size. If the OMP block is parallelized + // on anything other than the batch/block size, it should be reassigned + auto n = static_cast(std::distance(begin, end)); + const size_t batch_threads = + std::max(static_cast(1), std::min(n, static_cast(n_threads))); + common::MemStackAllocator partial_sums(batch_threads); + + size_t block_size = n / batch_threads; + + dmlc::OMPException exc; +#pragma omp parallel num_threads(batch_threads) + { +#pragma omp for + for (omp_ulong tid = 0; tid < batch_threads; ++tid) { + exc.Run([&]() { + size_t ibegin = block_size * tid; + size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1))); + + T running_sum = 0; + for (size_t ridx = ibegin; ridx < iend; ++ridx) { + running_sum += *(begin + ridx); + *(out_it + 1 + ridx) = running_sum; + } + }); + } + +#pragma omp single + { + exc.Run([&]() { + partial_sums[0] = init; + for (size_t i = 1; i < batch_threads; ++i) { + partial_sums[i] = partial_sums[i - 1] + *(out_it + i * block_size); + } + }); + } + +#pragma omp for + for (omp_ulong tid = 0; tid < batch_threads; ++tid) { + exc.Run([&]() { + size_t ibegin = block_size * tid; + size_t iend = (tid == (batch_threads - 1) ? 
n : (block_size * (tid + 1))); + + for (size_t i = ibegin; i < iend; ++i) { + *(out_it + 1 + i) += partial_sums[tid]; + } + }); + } + } + exc.Rethrow(); +} +} // namespace common +} // namespace xgboost + +#endif // XGBOOST_COMMON_NUMERIC_H_ diff --git a/src/data/data.cc b/src/data/data.cc index c297527c6bae..db952fc3a8fd 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -21,6 +21,7 @@ #include "../common/io.h" #include "../common/linalg_op.h" #include "../common/math.h" +#include "../common/numeric.h" #include "../common/version.h" #include "../common/group_data.h" #include "../common/threading_utils.h" diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 907a8c45c382..791bb47e7105 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -10,6 +10,7 @@ #include "../common/column_matrix.h" #include "../common/hist_util.h" +#include "../common/numeric.h" #include "../common/threading_utils.h" namespace xgboost { @@ -28,58 +29,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch, common::Span ft, size_t rbegin, size_t prev_sum, uint32_t nbins, int32_t n_threads) { - // The number of threads is pegged to the batch size. If the OMP - // block is parallelized on anything other than the batch/block size, - // it should be reassigned + auto page = batch.GetView(); + auto it = common::MakeIndexTransformIter([&](size_t ridx) { return page[ridx].size(); }); + common::PartialSum(n_threads, it, it + page.Size(), prev_sum, row_ptr.begin() + rbegin); + // The number of threads is pegged to the batch size. 
If the OMP block is parallelized + // on anything other than the batch/block size, it should be reassigned const size_t batch_threads = std::max(static_cast(1), std::min(batch.Size(), static_cast(n_threads))); - auto page = batch.GetView(); - common::MemStackAllocator partial_sums(batch_threads); - - size_t block_size = batch.Size() / batch_threads; - - dmlc::OMPException exc; -#pragma omp parallel num_threads(batch_threads) - { -#pragma omp for - for (omp_ulong tid = 0; tid < batch_threads; ++tid) { - exc.Run([&]() { - size_t ibegin = block_size * tid; - size_t iend = (tid == (batch_threads - 1) ? batch.Size() - : (block_size * (tid + 1))); - - size_t running_sum = 0; - for (size_t ridx = ibegin; ridx < iend; ++ridx) { - running_sum += page[ridx].size(); - row_ptr[rbegin + 1 + ridx] = running_sum; - } - }); - } - -#pragma omp single - { - exc.Run([&]() { - partial_sums[0] = prev_sum; - for (size_t i = 1; i < batch_threads; ++i) { - partial_sums[i] = partial_sums[i - 1] + row_ptr[rbegin + i * block_size]; - } - }); - } - -#pragma omp for - for (omp_ulong tid = 0; tid < batch_threads; ++tid) { - exc.Run([&]() { - size_t ibegin = block_size * tid; - size_t iend = (tid == (batch_threads - 1) ? 
batch.Size() - : (block_size * (tid + 1))); - - for (size_t i = ibegin; i < iend; ++i) { - row_ptr[rbegin + 1 + i] += partial_sums[tid]; - } - }); - } - } - exc.Rethrow(); const size_t n_index = row_ptr[rbegin + batch.Size()]; // number of entries in this page ResizeIndex(n_index, isDense_); diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc index f2675d918bdf..43dc36600013 100644 --- a/src/objective/adaptive.cc +++ b/src/objective/adaptive.cc @@ -7,6 +7,7 @@ #include #include "../common/common.h" +#include "../common/numeric.h" #include "../common/stats.h" #include "../common/threading_utils.h" #include "xgboost/tree_model.h" diff --git a/tests/cpp/common/test_numeric.cc b/tests/cpp/common/test_numeric.cc new file mode 100644 index 000000000000..5b672585031b --- /dev/null +++ b/tests/cpp/common/test_numeric.cc @@ -0,0 +1,33 @@ +/*! + * Copyright 2022, XGBoost contributors. + */ +#include + +#include + +#include "../../../src/common/numeric.h" + +namespace xgboost { +namespace common { +TEST(Numeric, PartialSum) { + { + std::vector values{1, 2, 3, 4}; + std::vector result(values.size() + 1); + Context ctx; + PartialSum(ctx.Threads(), values.begin(), values.end(), static_cast(0), result.begin()); + std::vector sol(values.size() + 1, 0); + std::partial_sum(values.begin(), values.end(), sol.begin() + 1); + ASSERT_EQ(sol, result); + } + { + std::vector values{1.5, 2.5, 3.5, 4.5}; + std::vector result(values.size() + 1); + Context ctx; + PartialSum(ctx.Threads(), values.begin(), values.end(), 0.0, result.begin()); + std::vector sol(values.size() + 1, 0.0); + std::partial_sum(values.begin(), values.end(), sol.begin() + 1); + ASSERT_EQ(sol, result); + } +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 2e2fd4a0b3d7..0ae23557f922 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -3,6 +3,7 @@ */ #include +#include "../../../src/common/numeric.h" 
#include "../../../src/tree/updater_approx.h" #include "../helpers.h" #include "test_partitioner.h"