Loop over copy_if (#6201)
* Loop over copy_if

* Catch OOM.

Co-authored-by: fis <jm.yuan@outlook.com>
RAMitchell and trivialfis committed Oct 13, 2020
1 parent 0fc263e commit 734a911
Showing 3 changed files with 64 additions and 18 deletions.
31 changes: 28 additions & 3 deletions src/common/device_helpers.cuh
@@ -129,6 +129,12 @@ inline size_t AvailableMemory(int device_idx) {
  return device_free;
}

+inline int32_t CurrentDevice() {
+  int32_t device = 0;
+  safe_cuda(cudaGetDevice(&device));
+  return device;
+}
+
inline size_t TotalMemory(int device_idx) {
  size_t device_free = 0;
  size_t device_total = 0;
@@ -384,6 +390,16 @@ template <typename T>
using XGBBaseDeviceAllocator = thrust::device_malloc_allocator<T>;
#endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1

+inline void ThrowOOMError(std::string const& err, size_t bytes) {
+  auto device = CurrentDevice();
+  auto rank = rabit::GetRank();
+  std::stringstream ss;
+  ss << "Memory allocation error on worker " << rank << ": " << err << "\n"
+     << "- Free memory: " << AvailableMemory(device) << "\n"
+     << "- Requested memory: " << bytes << std::endl;
+  LOG(FATAL) << ss.str();
+}
+
/**
 * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose.
 */
@@ -397,7 +413,13 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
    using other = XGBDefaultDeviceAllocatorImpl<U>;  // NOLINT
  };
  pointer allocate(size_t n) {  // NOLINT
-    pointer ptr = SuperT::allocate(n);
+    pointer ptr;
+    try {
+      ptr = SuperT::allocate(n);
+      dh::safe_cuda(cudaGetLastError());
+    } catch (const std::exception &e) {
+      ThrowOOMError(e.what(), n * sizeof(T));
+    }
    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
    return ptr;
  }
@@ -432,8 +454,11 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
  }
  pointer allocate(size_t n) {  // NOLINT
    T* ptr;
-    GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
-                                               n * sizeof(T));
+    auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
+                                                           n * sizeof(T));
+    if (errc != cudaSuccess) {
+      ThrowOOMError("Caching allocator", n * sizeof(T));
+    }
    pointer thrust_ptr{ ptr };
    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
    return thrust_ptr;
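For context, here is a minimal standalone sketch of the error-reporting pattern introduced above: attempt a device allocation and, on failure, report free versus requested bytes. It is only an illustration; the helper name AllocOrReport and the direct use of cudaMemGetInfo are assumptions made for the sketch, whereas the code in the diff goes through ThrowOOMError, AvailableMemory, and rabit::GetRank.

// Illustration only: the same try-allocate-then-report idea, without XGBoost internals.
#include <cuda_runtime.h>
#include <cstddef>
#include <sstream>
#include <stdexcept>

void* AllocOrReport(size_t bytes) {
  void* ptr = nullptr;
  cudaError_t status = cudaMalloc(&ptr, bytes);
  if (status != cudaSuccess) {
    cudaGetLastError();  // clear the recorded error so later status checks start clean
    size_t free_bytes = 0, total_bytes = 0;
    cudaMemGetInfo(&free_bytes, &total_bytes);
    std::stringstream ss;
    ss << "Memory allocation error: " << cudaGetErrorString(status) << "\n"
       << "- Free memory: " << free_bytes << "\n"
       << "- Requested memory: " << bytes;
    throw std::runtime_error(ss.str());
  }
  return ptr;
}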
41 changes: 27 additions & 14 deletions src/data/simple_dmatrix.cu
@@ -35,24 +35,38 @@ void CountRowOffsets(const AdapterBatchT& batch, common::Span<bst_row_t> offset,
      thrust::device_pointer_cast(offset.data()));
}

+template <typename AdapterBatchT>
+struct COOToEntryOp {
+  AdapterBatchT batch;
+  __device__ Entry operator()(size_t idx) {
+    const auto& e = batch.GetElement(idx);
+    return Entry(e.column_idx, e.value);
+  }
+};
+
// Here the data is already correctly ordered and simply needs to be compacted
// to remove missing data
template <typename AdapterT>
void CopyDataToDMatrix(AdapterT* adapter, common::Span<Entry> data,
-                       int device_idx, float missing,
-                       common::Span<size_t> row_ptr) {
-  auto& batch = adapter->Value();
-  auto transform_f = [=] __device__(size_t idx) {
-    const auto& e = batch.GetElement(idx);
-    return Entry(e.column_idx, e.value);
-  };  // NOLINT
+                       float missing) {
+  auto batch = adapter->Value();
  auto counting = thrust::make_counting_iterator(0llu);
-  thrust::transform_iterator<decltype(transform_f), decltype(counting), Entry>
-      transform_iter(counting, transform_f);
  dh::XGBCachingDeviceAllocator<char> alloc;
-  thrust::copy_if(
-      thrust::cuda::par(alloc), transform_iter, transform_iter + batch.Size(),
-      thrust::device_pointer_cast(data.data()), IsValidFunctor(missing));
+  COOToEntryOp<decltype(batch)> transform_op{batch};
+  thrust::transform_iterator<decltype(transform_op), decltype(counting)>
+      transform_iter(counting, transform_op);
+  // We loop over batches because thrust::copy_if can't deal with sizes > 2^31
+  // See thrust issue #1302
+  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
+  auto begin_output = thrust::device_pointer_cast(data.data());
+  for (size_t offset = 0; offset < batch.Size(); offset += max_copy_size) {
+    auto begin_input = transform_iter + offset;
+    auto end_input =
+        transform_iter + std::min(offset + max_copy_size, batch.Size());
+    begin_output =
+        thrust::copy_if(thrust::cuda::par(alloc), begin_input, end_input,
+                        begin_output, IsValidFunctor(missing));
+  }
}

// Does not currently support metainfo as no on-device data source contains this
@@ -77,8 +91,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
  CountRowOffsets(batch, s_offset, adapter->DeviceIdx(), missing);
  info_.num_nonzero_ = sparse_page_.offset.HostVector().back();
  sparse_page_.data.Resize(info_.num_nonzero_);
-  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(),
-                    adapter->DeviceIdx(), missing, s_offset);
+  CopyDataToDMatrix(adapter, sparse_page_.data.DeviceSpan(), missing);

  info_.num_col_ = adapter->NumColumns();
  info_.num_row_ = adapter->NumRows();
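As a reference for the chunking pattern above, here is a small self-contained sketch using plain thrust device vectors instead of the adapter batch and Span types; IsPositive stands in for IsValidFunctor, and CompactPositive is a made-up name for the sketch. The chunk size mirrors the std::numeric_limits<int>::max() / 2 bound used in the diff.

// Sketch: run copy_if over a large range in chunks so each call stays well
// below the 2^31-element limit discussed in thrust issue #1302.
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <algorithm>
#include <cstddef>
#include <limits>

struct IsPositive {
  __device__ bool operator()(float v) const { return v > 0.0f; }
};

// `out` must already be sized to hold every element that passes the predicate.
void CompactPositive(const thrust::device_vector<float>& in,
                     thrust::device_vector<float>& out) {
  size_t max_copy_size = std::numeric_limits<int>::max() / 2;
  auto begin_output = out.begin();
  for (size_t offset = 0; offset < in.size(); offset += max_copy_size) {
    auto begin_input = in.begin() + offset;
    auto end_input = in.begin() + std::min(offset + max_copy_size, in.size());
    // copy_if returns an iterator past the last element written, so each
    // chunk appends directly after the previous one.
    begin_output = thrust::copy_if(begin_input, end_input, begin_output, IsPositive{});
  }
}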
10 changes: 9 additions & 1 deletion tests/cpp/common/test_device_helpers.cu
@@ -156,5 +156,13 @@ TEST(SegmentedUnique, Regression) {
    TestSegmentedUniqueRegression(values, 0);
  }
}
+
+TEST(Allocator, OOM) {
+  auto size = dh::AvailableMemory(0) * 4;
+  ASSERT_THROW({dh::caching_device_vector<char> vec(size);}, dmlc::Error);
+  ASSERT_THROW({dh::device_vector<char> vec(size);}, dmlc::Error);
+  // Clear last error so we don't fail subsequent tests
+  cudaGetLastError();
+}
} // namespace common
} // namespace xgboost
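One note on the test: a failed allocation leaves an error code recorded in the CUDA runtime, and cudaGetLastError() both returns and clears it, which is why the test calls it before returning. Below is a minimal sketch of that behaviour, independent of the test harness and assuming only the CUDA runtime API.

#include <cuda_runtime.h>
#include <cassert>
#include <cstddef>

int main() {
  void* p = nullptr;
  // Deliberately request far more memory than any device has; the call fails
  // and the failure is recorded as the runtime's last error.
  cudaError_t status = cudaMalloc(&p, ~size_t{0});
  assert(status != cudaSuccess);
  assert(cudaGetLastError() != cudaSuccess);  // the failure is still recorded ...
  assert(cudaGetLastError() == cudaSuccess);  // ... and has now been cleared
  return 0;
}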
