
Extract partial sum into an independent algorithm.
trivialfis committed May 12, 2022
1 parent 7ef54e3 commit 2565432
Showing 7 changed files with 137 additions and 68 deletions.
18 changes: 0 additions & 18 deletions src/common/common.h
@@ -274,24 +274,6 @@ template <typename Indexable>
XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) {
return indptr[group + 1] - 1;
}

/**
* \brief Run length encode on CPU, input must be sorted.
*/
template <typename Iter, typename Idx>
void RunLengthEncode(Iter begin, Iter end, std::vector<Idx> *p_out) {
auto &out = *p_out;
out = std::vector<Idx>{0};
size_t n = std::distance(begin, end);
for (size_t i = 1; i < n; ++i) {
if (begin[i] != begin[i - 1]) {
out.push_back(i);
}
}
if (out.back() != n) {
out.push_back(n);
}
}
} // namespace common
} // namespace xgboost
#endif // XGBOOST_COMMON_COMMON_H_
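
For context before its new home below, a minimal standalone usage sketch of RunLengthEncode (the include path is illustrative, not part of the commit): on sorted input it records the start offset of each run plus the final end offset.

#include <cstddef>
#include <vector>

#include "../common/numeric.h"  // illustrative path; the function now lives here

int main() {
  std::vector<int> sorted{0, 0, 1, 1, 1, 4};  // input must be sorted
  std::vector<std::size_t> offsets;
  xgboost::common::RunLengthEncode(sorted.cbegin(), sorted.cend(), &offsets);
  // offsets == {0, 2, 5, 6}: run i spans [offsets[i], offsets[i + 1]).
  return 0;
}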
95 changes: 95 additions & 0 deletions src/common/numeric.h
@@ -0,0 +1,95 @@
/*!
* Copyright 2022, XGBoost contributors.
*/
#ifndef XGBOOST_COMMON_NUMERIC_H_
#define XGBOOST_COMMON_NUMERIC_H_

#include <iterator> // std::iterator_traits
#include <vector>

#include "threading_utils.h"
#include "xgboost/generic_parameters.h"

namespace xgboost {
namespace common {

/**
* \brief Run-length encode on CPU; the input must be sorted.
*/
template <typename Iter, typename Idx>
void RunLengthEncode(Iter begin, Iter end, std::vector<Idx> *p_out) {
auto &out = *p_out;
out = std::vector<Idx>{0};
size_t n = std::distance(begin, end);
for (size_t i = 1; i < n; ++i) {
if (begin[i] != begin[i - 1]) {
out.push_back(i);
}
}
if (out.back() != n) {
out.push_back(n);
}
}

/**
* \brief Variant of std::partial_sum; out_it should point to a container that has n + 1
* elements. Useful for constructing a CSR indptr.
*/
template <typename InIt, typename OutIt, typename T>
void PartialSum(int32_t n_threads, InIt begin, InIt end, T init, OutIt out_it) {
static_assert(std::is_same<T, typename std::iterator_traits<InIt>::value_type>::value, "");
static_assert(std::is_same<T, typename std::iterator_traits<OutIt>::value_type>::value, "");
// The number of threads is pegged to the batch size. If the OMP block is parallelized
// on anything other than the batch/block size, it should be reassigned
auto n = static_cast<size_t>(std::distance(begin, end));
const size_t batch_threads =
std::max(static_cast<size_t>(1), std::min(n, static_cast<size_t>(n_threads)));
common::MemStackAllocator<T, 128> partial_sums(batch_threads);

size_t block_size = n / batch_threads;

dmlc::OMPException exc;
#pragma omp parallel num_threads(batch_threads)
{
#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

T running_sum = 0;
for (size_t ridx = ibegin; ridx < iend; ++ridx) {
running_sum += *(begin + ridx);
*(out_it + 1 + ridx) = running_sum;
}
});
}

#pragma omp single
{
exc.Run([&]() {
partial_sums[0] = init;
for (size_t i = 1; i < batch_threads; ++i) {
partial_sums[i] = partial_sums[i - 1] + *(out_it + i * block_size);
}
});
}

#pragma omp for
for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
exc.Run([&]() {
size_t ibegin = block_size * tid;
size_t iend = (tid == (batch_threads - 1) ? n : (block_size * (tid + 1)));

for (size_t i = ibegin; i < iend; ++i) {
*(out_it + 1 + i) += partial_sums[tid];
}
});
}
}
exc.Rethrow();
}
} // namespace common
} // namespace xgboost

#endif // XGBOOST_COMMON_NUMERIC_H_
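
To make the intended use concrete, a minimal standalone sketch of PartialSum building a CSR indptr (the include path and thread count are illustrative, not part of the commit). Internally, each thread scans its own block, a single thread then prefix-sums the per-block totals seeded with init, and a final parallel pass adds each block's offset to its slots. Note that only slots 1..n are written; slot 0 keeps whatever the caller stored there, which is why call sites hand it a buffer whose first element already holds the running offset.

#include <cstddef>
#include <vector>

#include "../common/numeric.h"  // illustrative path

int main() {
  std::vector<std::size_t> row_sizes{3, 0, 2, 5};
  // n + 1 slots; PartialSum fills indices 1..n and leaves index 0 untouched.
  std::vector<std::size_t> indptr(row_sizes.size() + 1, 0);
  xgboost::common::PartialSum(/*n_threads=*/2, row_sizes.cbegin(), row_sizes.cend(),
                              static_cast<std::size_t>(0), indptr.begin());
  // indptr == {0, 3, 3, 5, 10}; row i occupies [indptr[i], indptr[i + 1]).
  return 0;
}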
1 change: 1 addition & 0 deletions src/data/data.cc
@@ -21,6 +21,7 @@
#include "../common/io.h"
#include "../common/linalg_op.h"
#include "../common/math.h"
#include "../common/numeric.h"
#include "../common/version.h"
#include "../common/group_data.h"
#include "../common/threading_utils.h"
56 changes: 6 additions & 50 deletions src/data/gradient_index.cc
@@ -10,6 +10,7 @@

#include "../common/column_matrix.h"
#include "../common/hist_util.h"
#include "../common/numeric.h"
#include "../common/threading_utils.h"

namespace xgboost {
@@ -28,58 +29,13 @@ void GHistIndexMatrix::PushBatch(SparsePage const &batch,
common::Span<FeatureType const> ft,
size_t rbegin, size_t prev_sum, uint32_t nbins,
int32_t n_threads) {
-  // The number of threads is pegged to the batch size. If the OMP block is parallelized
-  // on anything other than the batch/block size, it should be reassigned
-  const size_t batch_threads =
-      std::max(static_cast<size_t>(1), std::min(batch.Size(), static_cast<size_t>(n_threads)));
-  auto page = batch.GetView();
-  common::MemStackAllocator<size_t, 128> partial_sums(batch_threads);
-
-  size_t block_size = batch.Size() / batch_threads;
-
-  dmlc::OMPException exc;
-#pragma omp parallel num_threads(batch_threads)
-  {
-#pragma omp for
-    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
-      exc.Run([&]() {
-        size_t ibegin = block_size * tid;
-        size_t iend = (tid == (batch_threads - 1) ? batch.Size()
-                                                  : (block_size * (tid + 1)));
-
-        size_t running_sum = 0;
-        for (size_t ridx = ibegin; ridx < iend; ++ridx) {
-          running_sum += page[ridx].size();
-          row_ptr[rbegin + 1 + ridx] = running_sum;
-        }
-      });
-    }
-
-#pragma omp single
-    {
-      exc.Run([&]() {
-        partial_sums[0] = prev_sum;
-        for (size_t i = 1; i < batch_threads; ++i) {
-          partial_sums[i] = partial_sums[i - 1] + row_ptr[rbegin + i * block_size];
-        }
-      });
-    }
-
-#pragma omp for
-    for (omp_ulong tid = 0; tid < batch_threads; ++tid) {
-      exc.Run([&]() {
-        size_t ibegin = block_size * tid;
-        size_t iend = (tid == (batch_threads - 1) ? batch.Size()
-                                                  : (block_size * (tid + 1)));
-
-        for (size_t i = ibegin; i < iend; ++i) {
-          row_ptr[rbegin + 1 + i] += partial_sums[tid];
-        }
-      });
-    }
-  }
-  exc.Rethrow();
+  auto page = batch.GetView();
+  auto it = common::MakeIndexTransformIter([&](size_t ridx) { return page[ridx].size(); });
+  common::PartialSum(n_threads, it, it + page.Size(), prev_sum, row_ptr.begin() + rbegin);

const size_t n_index = row_ptr[rbegin + batch.Size()]; // number of entries in this page
ResizeIndex(n_index, isDense_);
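
The three added lines above are now the entire algorithm at this call site: an index-transform iterator presents page[ridx].size() as an input sequence, and common::PartialSum performs the blocked scan. A self-contained sketch of the same pattern (the container and the transform_iterator.h include path are assumptions for illustration):

#include <cstddef>
#include <vector>

#include "../common/numeric.h"             // common::PartialSum
#include "../common/transform_iterator.h"  // assumed header for MakeIndexTransformIter

int main() {
  std::vector<std::vector<int>> rows{{1, 2}, {}, {3, 4, 5}};
  // Dereferencing at index ridx yields rows[ridx].size().
  auto it = xgboost::common::MakeIndexTransformIter(
      [&](std::size_t ridx) { return rows[ridx].size(); });
  std::vector<std::size_t> row_ptr(rows.size() + 1, 0);
  xgboost::common::PartialSum(/*n_threads=*/1, it, it + rows.size(),
                              static_cast<std::size_t>(0), row_ptr.begin());
  // row_ptr == {0, 2, 2, 5}
  return 0;
}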
1 change: 1 addition & 0 deletions src/objective/adaptive.cc
@@ -7,6 +7,7 @@
#include <vector>

#include "../common/common.h"
#include "../common/numeric.h"
#include "../common/stats.h"
#include "../common/threading_utils.h"
#include "xgboost/tree_model.h"
33 changes: 33 additions & 0 deletions tests/cpp/common/test_numeric.cc
@@ -0,0 +1,33 @@
/*!
* Copyright 2022, XGBoost contributors.
*/
#include <gtest/gtest.h>

#include <numeric>

#include "../../../src/common/numeric.h"

namespace xgboost {
namespace common {
TEST(Numeric, PartialSum) {
{
std::vector<size_t> values{1, 2, 3, 4};
std::vector<size_t> result(values.size() + 1);
Context ctx;
PartialSum(ctx.Threads(), values.begin(), values.end(), static_cast<size_t>(0), result.begin());
std::vector<size_t> sol(values.size() + 1, 0);
std::partial_sum(values.begin(), values.end(), sol.begin() + 1);
ASSERT_EQ(sol, result);
}
{
std::vector<double> values{1.5, 2.5, 3.5, 4.5};
std::vector<double> result(values.size() + 1);
Context ctx;
PartialSum(ctx.Threads(), values.begin(), values.end(), 0.0, result.begin());
std::vector<double> sol(values.size() + 1, 0.0);
std::partial_sum(values.begin(), values.end(), sol.begin() + 1);
ASSERT_EQ(sol, result);
}
}
} // namespace common
} // namespace xgboost
1 change: 1 addition & 0 deletions tests/cpp/tree/test_approx.cc
@@ -3,6 +3,7 @@
*/
#include <gtest/gtest.h>

#include "../../../src/common/numeric.h"
#include "../../../src/tree/updater_approx.h"
#include "../helpers.h"
#include "test_partitioner.h"
