Fall back to CUB allocator if RMM memory pool is not set up (#6150)
* Fall back to CUB allocator if RMM memory pool is not set up

* Fix build

* Prevent memory leak

* Add note about lack of memory initialisation

* Add check for other fast allocators

* Set use_cub_allocator_ to true when RMM is not enabled

* Fix clang-tidy

* Do not demangle symbol; add check to ensure Linux+Clang/GCC combo
hcho3 committed Sep 24, 2020
1 parent 5b05f88 commit 72ef553
Showing 2 changed files with 49 additions and 33 deletions.
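For context: the change below makes XGBoost's caching device allocator inspect the memory resource that RMM currently has installed, and only route allocations through RMM when that resource is one of the fast pooling kinds checked in the diff (pool, binning, or arena); otherwise it falls back to cub::CachingDeviceAllocator. A minimal sketch of how a user would set up such a pool so the fallback does not trigger, assuming the RMM 0.15-era C++ headers and API (cuda_memory_resource, pool_memory_resource, set_current_device_resource — the same get_current_device_resource() hook the diff queries):

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main() {
  // Upstream resource backed by plain cudaMalloc/cudaFree.
  rmm::mr::cuda_memory_resource cuda_mr;
  // Pool that sub-allocates from the upstream resource.
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{&cuda_mr};
  // Install the pool; rmm::mr::get_current_device_resource(), which the
  // allocators in this diff query, now returns it, so the constructor below
  // sets use_cub_allocator_ to false and RMM serves all allocations.
  rmm::mr::set_current_device_resource(&pool_mr);
  // ... create DMatrix objects and train boosters here ...
  return 0;
}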
6 changes: 6 additions & 0 deletions CMakeLists.txt
@@ -90,6 +90,12 @@ endif (USE_AVX)
 if (PLUGIN_RMM AND NOT (USE_CUDA))
   message(SEND_ERROR "`PLUGIN_RMM` must be enabled with `USE_CUDA` flag.")
 endif (PLUGIN_RMM AND NOT (USE_CUDA))
+if (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+  message(SEND_ERROR "`PLUGIN_RMM` must be used with GCC or Clang compiler.")
+endif (PLUGIN_RMM AND NOT ((CMAKE_CXX_COMPILER_ID STREQUAL "Clang") OR (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")))
+if (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
+  message(SEND_ERROR "`PLUGIN_RMM` must be used with Linux.")
+endif (PLUGIN_RMM AND NOT (CMAKE_SYSTEM_NAME STREQUAL "Linux"))
 if (ENABLE_ALL_WARNINGS)
   if ((NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") AND (NOT CMAKE_CXX_COMPILER_ID STREQUAL "GNU"))
     message(SEND_ERROR "ENABLE_ALL_WARNINGS is only available for Clang and GCC.")
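These guards exist because the allocator change in the next file identifies RMM resources by searching the raw (un-demangled) typeid name, whose format is implementation-defined; the build is therefore pinned to toolchains where that format is known. Under the Itanium C++ ABI used by GCC and Clang on Linux, a class name survives mangling as a plain substring, so no demangler is needed. A self-contained illustration — the rmm classes here are stand-in mocks for demonstration only, not the real RMM types:

#include <iostream>
#include <string>
#include <typeinfo>

// Stand-ins for the real RMM resource classes, for illustration only.
namespace rmm { namespace mr {
struct device_memory_resource { virtual ~device_memory_resource() = default; };
struct cuda_memory_resource : device_memory_resource {};
template <class Upstream>
struct pool_memory_resource : device_memory_resource {};
}}  // namespace rmm::mr

int main() {
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool;
  rmm::mr::device_memory_resource *mr = &pool;
  // With GCC/Clang on Linux this prints the raw Itanium-ABI mangled name of
  // the dynamic type, e.g.
  // "N3rmm2mr20pool_memory_resourceINS0_20cuda_memory_resourceEEE".
  std::string symbol{typeid(*mr).name()};
  std::cout << symbol << "\n";
  // "pool_memory_resource" appears verbatim, so a substring search suffices;
  // no call to abi::__cxa_demangle is required.
  std::cout << std::boolalpha
            << (symbol.find("pool_memory_resource") != std::string::npos) << "\n";
  return 0;
}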
76 changes: 43 additions & 33 deletions src/common/device_helpers.cuh
@@ -402,57 +402,67 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
   }
   void deallocate(pointer ptr, size_t n) {  // NOLINT
     GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    return SuperT::deallocate(ptr, n);
+    SuperT::deallocate(ptr, n);
   }
 #if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
   XGBDefaultDeviceAllocatorImpl()
     : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{nullptr}) {}
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
 };
 
-#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
-template <typename T>
-using XGBCachingDeviceAllocatorImpl = XGBDefaultDeviceAllocatorImpl<T>;
-#else
 /**
- * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end and logs
- *        allocations if verbose. Does not initialise memory on construction.
+ * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless
+ *        RMM pool allocator is enabled. Does not initialise memory on construction.
  */
 template <class T>
-struct XGBCachingDeviceAllocatorImpl : thrust::device_malloc_allocator<T> {
+struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
+  using SuperT = XGBBaseDeviceAllocator<T>;
   using pointer = thrust::device_ptr<T>;  // NOLINT
   template<typename U>
   struct rebind  // NOLINT
   {
     using other = XGBCachingDeviceAllocatorImpl<U>;  // NOLINT
   };
-  cub::CachingDeviceAllocator& GetGlobalCachingAllocator ()
-  {
-    // Configure allocator with maximum cached bin size of ~1GB and no limit on
-    // maximum cached bytes
-    static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
-    return *allocator;
-  }
-  pointer allocate(size_t n) {  // NOLINT
-    T *ptr;
-    GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void **>(&ptr),
-                                               n * sizeof(T));
-    pointer thrust_ptr(ptr);
-    GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T));
-    return thrust_ptr;
-  }
-  void deallocate(pointer ptr, size_t n) {  // NOLINT
-    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
-    GetGlobalCachingAllocator().DeviceFree(ptr.get());
-  }
-
-  __host__ __device__
-    void construct(T *)  // NOLINT
-  {
-    // no-op
+  cub::CachingDeviceAllocator& GetGlobalCachingAllocator() {
+    // Configure allocator with maximum cached bin size of ~1GB and no limit on
+    // maximum cached bytes
+    static cub::CachingDeviceAllocator *allocator = new cub::CachingDeviceAllocator(2, 9, 29);
+    return *allocator;
+  }
+  pointer allocate(size_t n) {  // NOLINT
+    pointer ptr;
+    if (use_cub_allocator_) {
+      T* raw_ptr{nullptr};
+      GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast<void**>(&raw_ptr), n * sizeof(T));
+      ptr = pointer(raw_ptr);
+    } else {
+      ptr = SuperT::allocate(n);
+    }
+    GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T));
+    return ptr;
+  }
+  void deallocate(pointer ptr, size_t n) {  // NOLINT
+    GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T));
+    if (use_cub_allocator_) {
+      GetGlobalCachingAllocator().DeviceFree(ptr.get());
+    } else {
+      SuperT::deallocate(ptr, n);
+    }
+  }
+#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+  XGBCachingDeviceAllocatorImpl()
+    : SuperT(rmm::mr::get_current_device_resource(), cudaStream_t{nullptr}) {
+    std::string symbol{typeid(*SuperT::resource()).name()};
+    if (symbol.find("pool_memory_resource") != std::string::npos
+        || symbol.find("binning_memory_resource") != std::string::npos
+        || symbol.find("arena_memory_resource") != std::string::npos) {
+      use_cub_allocator_ = false;
+    }
   }
-};
 #endif  // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
+ private:
+  bool use_cub_allocator_{true};
+};
 }  // namespace detail
 
 // Declare xgboost allocators
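A note on the CachingDeviceAllocator(2, 9, 29) arguments this change keeps: under CUB's convention the three values are bin_growth, min_bin, and max_bin, giving power-of-two cached bins from 2^9 = 512 bytes up to 2^29 bytes (512 MiB); requests larger than the biggest bin fall through to cudaMalloc and are not cached. A minimal standalone sketch of the same back-end configuration, independent of XGBoost:

#include <cstdio>
#include <cuda_runtime.h>
#include <cub/util_allocator.cuh>

int main() {
  // Same geometry as the diff: bins grow by powers of 2, smallest bin
  // 2^9 = 512 B, largest bin 2^29 B, with no cap on total cached bytes.
  cub::CachingDeviceAllocator allocator(2, 9, 29);

  void *d_ptr = nullptr;
  // The request is rounded up to the nearest bin and, when possible, served
  // from a previously cached block instead of a fresh cudaMalloc.
  if (allocator.DeviceAllocate(&d_ptr, 1000 * sizeof(float)) != cudaSuccess) {
    std::fprintf(stderr, "device allocation failed\n");
    return 1;
  }
  // Returns the block to the allocator's cache rather than freeing it.
  allocator.DeviceFree(d_ptr);
  return 0;
}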
