Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extract partial sum into an independent function. #7889

Merged
merged 2 commits into from May 13, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
18 changes: 0 additions & 18 deletions src/common/common.h
Expand Up @@ -274,24 +274,6 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}

/**
 * \brief Run length encode on CPU, input must be sorted.
 */
template <typename Iter, typename Idx>
void RunLengthEncode(Iter begin, Iter end, std::vector<Idx> *p_out) {
  auto &encoded = *p_out;
  // Every result starts with the leading offset 0.
  encoded.assign(1, Idx{0});
  auto total = static_cast<std::size_t>(std::distance(begin, end));
  // Record the position of every value change as the start of a new run.
  for (std::size_t pos = 1; pos < total; ++pos) {
    if (begin[pos] != begin[pos - 1]) {
      encoded.push_back(pos);
    }
  }
  // Terminate the offsets with the total length (skipped for empty input,
  // where the back element 0 already equals the length).
  if (encoded.back() != total) {
    encoded.push_back(total);
  }
}
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
96 changes: 96 additions & 0 deletions src/common/numeric.h
@@ -0,0 +1,96 @@
/*!
* Copyright 2022, XGBoost contributors.
*/
#ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_

#include <algorithm>    // std::max
#include <cstdint>      // int32_t
#include <iterator>     // std::iterator_traits
#include <type_traits>  // std::is_same
#include <vector>

#include "threading_utils.h"
#include "xgboost/generic_parameters.h"

namespace xgboost {
namespace common {

/**
 * \brief Run length encode on CPU, input must be sorted.
 *
 * Writes a CSR-style offset vector into *p_out: out[k] is the index at which
 * the k-th run of equal values begins, followed by a terminating offset equal
 * to the input length.  For an empty input the result is {0}.
 *
 * \param begin Start of the sorted input.  Only forward iteration and
 *              inequality comparison of values are required, so any forward
 *              iterator works (previously random access was required).
 * \param end   End of the sorted input.
 * \param p_out Output offset vector; overwritten.
 */
template <typename Iter, typename Idx>
void RunLengthEncode(Iter begin, Iter end, std::vector<Idx> *p_out) {
  auto &out = *p_out;
  out = std::vector<Idx>{0};
  if (begin == end) {
    return;
  }
  std::size_t idx = 1;  // index of *it within the input sequence
  for (auto prev = begin, it = std::next(begin); it != end; ++prev, ++it, ++idx) {
    if (*it != *prev) {  // a new run starts at position idx
      out.push_back(static_cast<Idx>(idx));
    }
  }
  // idx == n here; close the final run with the total length.
  if (out.back() != idx) {
    out.push_back(static_cast<Idx>(idx));
  }
}

/**
 * \brief Variant of std::partial_sum computed in parallel.  out_it should point to a
 * container that has n + 1 elements, where n = std::distance(begin, end).  Useful for
 * constructing a CSR indptr.
 *
 * \param n_threads Upper bound on the number of OMP threads used.
 * \param begin     Beginning of the input values.
 * \param end       End of the input values.
 * \param init      Value folded into every prefix sum (e.g. the running total of a
 *                  previous batch when building an indptr incrementally).
 * \param out_it    Output iterator.  Elements [1, n] receive the inclusive prefix
 *                  sums.  NOTE(review): element 0 (*out_it) is never written by this
 *                  function — callers appear to rely on it being initialized
 *                  beforehand; confirm.
 */
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
  // The accumulator type T must match the value type of both iterators exactly so
  // no implicit narrowing/widening happens inside the summation.
  static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
  static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
  // The number of threads is pegged to the batch size. If the OMP block is parallelized
  // on anything other than the batch/block size, it should be reassigned
  auto n = static_cast<size_t>(std::distance(begin, end));
  const size_t batch_threads =
      std::max(static_cast<size_t>(1), std::min(n, static_cast<size_t>(n_threads)));
  // Per-thread block offsets; stack-allocated for small thread counts.
  common::MemStackAllocator<T, 128> partial_sums(batch_threads);

  // Each thread owns a contiguous block of this many input elements; the last
  // thread additionally takes the remainder.
  size_t block_size = n / batch_threads;

  dmlc::OMPException exc;
#pragma omp parallel num_threads(batch_threads)
  {
    // Phase 1: each thread computes a local inclusive prefix sum over its own
    // block, written to the output shifted by one position.
#pragma omp for
    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
      exc.Run([&]() {
        size_t ibegin = block_size * tid;
        size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

        T running_sum = 0;
        for (size_t ridx = ibegin; ridx < iend; ++ridx) {
          running_sum += *(begin + ridx);
          *(out_it + 1 + ridx) = running_sum;
        }
      });
    }

    // Phase 2 (single thread): compute the offset each block must add — init plus
    // the totals of all preceding blocks (the last value each block wrote).
#pragma omp single
    {
      exc.Run([&]() {
        partial_sums[0] = init;
        for (size_t i = 1; i < batch_threads; ++i) {
          partial_sums[i] = partial_sums[i - 1] + *(out_it + i * block_size);
        }
      });
    }

    // Phase 3: each thread adds its block's offset to its local prefix sums,
    // turning them into global prefix sums.
#pragma omp for
    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
      exc.Run([&]() {
        size_t ibegin = block_size * tid;
        size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

        for (size_t i = ibegin; i < iend; ++i) {
          *(out_it + 1 + i) += partial_sums[tid];
        }
      });
    }
  }
  // Re-throw any exception captured inside the parallel region.
  exc.Rethrow();
}
} // namespace common
} // namespace xgboost

#endif // XGBOOST_COMMON_NUMERIC_H_
1 change: 1 addition & 0 deletions src/data/data.cc
Expand Up @@ -21,6 +21,7 @@
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"
#include "../common/version.h"
#include "../common/group_data.h"
#include "../common/threading_utils.h"
Expand Down
56 changes: 6 additions & 50 deletions src/data/gradient_index.cc
Expand Up @@ -10,6 +10,7 @@

#include "../common/column_matrix.h"
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"

namespace xgboost {
Expand All @@ -28,58 +29,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch,
common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads) {
// The number of threads is pegged to the batch size. If the OMP
// block is parallelized on anything other than the batch/block size,
// it should be reassigned
auto page = batch.GetView();
auto it = common::MakeIndexTransformIter([&](size_t ridx) { return page[ridx].size(); });
common::PartialSum(n_threads, it, it + page.Size(), prev_sum, row_ptr.begin() + rbegin);
// The number of threads is pegged to the batch size. If the OMP block is parallelized
// on anything other than the batch/block size, it should be reassigned
const size_t batch_threads =
std::max(static_cast<size_t>(1), std::min(batch.Size(), static_cast<size_t>(n_threads)));
auto page = batch.GetView();
common::MemStackAllocator<size_t, 128> partial_sums(batch_threads);

size_t block_size = batch.Size() / batch_threads;

dmlc::OMPException exc;
#pragma omp parallel num_threads(batch_threads)
{
#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads - 1) ? batch.Size()
: (block_size * (tid + 1)));

size_t running_sum = 0;
for (size_t ridx = ibegin; ridx < iend; ++ridx) {
running_sum += page[ridx].size();
row_ptr[rbegin + 1 + ridx] = running_sum;
}
});
}

#pragma omp single
{
exc.Run([&]() {
partial_sums[0] = prev_sum;
for (size_t i = 1; i < batch_threads; ++i) {
partial_sums[i] = partial_sums[i - 1] + row_ptr[rbegin + i * block_size];
}
});
}

#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads - 1) ? batch.Size()
: (block_size * (tid + 1)));

for (size_t i = ibegin; i < iend; ++i) {
row_ptr[rbegin + 1 + i] += partial_sums[tid];
}
});
}
}
exc.Rethrow();

const size_t n_index = row_ptr[rbegin + batch.Size()]; // number of entries in this page
ResizeIndex(n_index, isDense_);
Expand Down
1 change: 1 addition & 0 deletions src/objective/adaptive.cc
Expand Up @@ -7,6 +7,7 @@
#include <vector>

#include "../common/common.h"
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "xgboost/tree_model.h"
Expand Down
33 changes: 33 additions & 0 deletions tests/cpp/common/test_numeric.cc
@@ -0,0 +1,33 @@
/*!
* Copyright 2022, XGBoost contributors.
*/
#include <gtest/gtest.h>

#include <numeric>

#include "../../../src/common/numeric.h"

namespace xgboost {
namespace common {
// PartialSum must agree with the sequential std::partial_sum for both integral
// and floating-point value types; the extra leading output element stays zero.
TEST(Numeric, PartialSum) {
  {
    // Integral accumulation.
    Context ctx;
    std::vector<size_t> input{1, 2, 3, 4};
    std::vector<size_t> got(input.size() + 1);
    PartialSum(ctx.Threads(), input.begin(), input.end(), static_cast<size_t>(0), got.begin());
    std::vector<size_t> expect(input.size() + 1, 0);
    std::partial_sum(input.begin(), input.end(), expect.begin() + 1);
    ASSERT_EQ(expect, got);
  }
  {
    // Floating-point accumulation.
    Context ctx;
    std::vector<double> input{1.5, 2.5, 3.5, 4.5};
    std::vector<double> got(input.size() + 1);
    PartialSum(ctx.Threads(), input.begin(), input.end(), 0.0, got.begin());
    std::vector<double> expect(input.size() + 1, 0.0);
    std::partial_sum(input.begin(), input.end(), expect.begin() + 1);
    ASSERT_EQ(expect, got);
  }
}
} // namespace common
} // namespace xgboost
1 change: 1 addition & 0 deletions tests/cpp/tree/test_approx.cc
Expand Up @@ -3,6 +3,7 @@
*/
#include <gtest/gtest.h>

#include "../../../src/common/numeric.h"
#include "../../../src/tree/updater_approx.h"
#include "../helpers.h"
#include "test_partitioner.h"
Expand Down