From e5ed64c81cbd14738c025e02492551fbd734c4d7 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 19 Aug 2021 19:23:03 +0200 Subject: [PATCH] Add public interface for constructing and freeing caching allocators --- .../CUDAServices/src/CUDAService.cc | 9 ++-- .../interface/cachingAllocators.h | 13 ++++++ .../src/cachingAllocatorCommon.h | 42 +++++++++++++++++++ .../CUDAUtilities/src/cachingAllocators.cc | 16 +++++++ .../src/getCachingDeviceAllocator.h | 36 ++-------------- .../src/getCachingHostAllocator.h | 6 ++- 6 files changed, 81 insertions(+), 41 deletions(-) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h create mode 100644 HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h create mode 100644 HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 5d1bd30264186..d7f194829d159 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -16,8 +16,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" -#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h" -#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h" void setCudaLimit(cudaLimit limit, const char* name, size_t request) { // read the current device @@ -300,8 +299,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) { // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction if constexpr (cms::cuda::allocator::useCaching) { - cms::cuda::allocator::getCachingDeviceAllocator(); - cms::cuda::allocator::getCachingHostAllocator(); + cms::cuda::allocator::cachingAllocatorsConstruct(); } cms::cuda::getEventCache().clear(); cms::cuda::getStreamCache().clear(); @@ -319,8 +317,7 @@ CUDAService::~CUDAService() { if (enabled_) { // Explicitly destruct the allocator before the device resets below if constexpr (cms::cuda::allocator::useCaching) { - cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached(); - cms::cuda::allocator::getCachingHostAllocator().FreeAllCached(); + cms::cuda::allocator::cachingAllocatorsFreeCached(); } cms::cuda::getEventCache().clear(); cms::cuda::getStreamCache().clear(); diff --git a/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h b/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h new file mode 100644 index 0000000000000..3eede5cf5c5ed --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h @@ -0,0 +1,13 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h +#define HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h + +namespace cms::cuda::allocator { + // Use caching or not + constexpr bool useCaching = true; + + // these intended to be called only from CUDAService + void cachingAllocatorsConstruct(); + void cachingAllocatorsFreeCached(); +} // namespace cms::cuda::allocator + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h b/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h new file mode 100644 index 0000000000000..c7f2fd038c297 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h @@ -0,0 +1,42 @@ +#ifndef HeterogeneousCore_CUDACore_src_cachingAllocatorCommon +#define HeterogeneousCore_CUDACore_src_cachingAllocatorCommon + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" + +#include +#include + +namespace cms::cuda::allocator { + // Growth factor (bin_growth in cub::CachingDeviceAllocator + constexpr unsigned int binGrowth = 2; + // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator + constexpr unsigned int minBin = 8; + // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. + constexpr unsigned int maxBin = 30; + // Total storage for the allocator. 0 means no limit. + constexpr size_t maxCachedBytes = 0; + // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. + constexpr double maxCachedFraction = 0.8; + constexpr bool debug = false; + + inline size_t minCachedBytes() { + size_t ret = std::numeric_limits::max(); + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + const int numberOfDevices = deviceCount(); + for (int i = 0; i < numberOfDevices; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); + } + cudaCheck(cudaSetDevice(currentDevice)); + if (maxCachedBytes > 0) { + ret = std::min(ret, maxCachedBytes); + } + return ret; + } +} // namespace cms::cuda::allocator + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc b/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc new file mode 100644 index 0000000000000..8a4511eb3ac60 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc @@ -0,0 +1,16 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h" + +#include "getCachingDeviceAllocator.h" +#include "getCachingHostAllocator.h" + +namespace cms::cuda::allocator { + void cachingAllocatorsConstruct() { + cms::cuda::allocator::getCachingDeviceAllocator(); + cms::cuda::allocator::getCachingHostAllocator(); + } + + void cachingAllocatorsFreeCached() { + cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached(); + cms::cuda::allocator::getCachingHostAllocator().FreeAllCached(); + } +} // namespace cms::cuda::allocator diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h index 3770dbac574d9..492d97f800e69 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h @@ -4,44 +4,14 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h" + #include "CachingDeviceAllocator.h" +#include "cachingAllocatorCommon.h" #include namespace cms::cuda::allocator { - // Use caching or not - constexpr bool useCaching = true; - // Growth factor (bin_growth in cub::CachingDeviceAllocator - constexpr unsigned int binGrowth = 2; - // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator - constexpr unsigned int minBin = 8; - // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail. - constexpr unsigned int maxBin = 30; - // Total storage for the allocator. 0 means no limit. - constexpr size_t maxCachedBytes = 0; - // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken. - constexpr double maxCachedFraction = 0.8; - constexpr bool debug = false; - - inline size_t minCachedBytes() { - size_t ret = std::numeric_limits::max(); - int currentDevice; - cudaCheck(cudaGetDevice(¤tDevice)); - const int numberOfDevices = deviceCount(); - for (int i = 0; i < numberOfDevices; ++i) { - size_t freeMemory, totalMemory; - cudaCheck(cudaSetDevice(i)); - cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); - ret = std::min(ret, static_cast(maxCachedFraction * freeMemory)); - } - cudaCheck(cudaSetDevice(currentDevice)); - if (maxCachedBytes > 0) { - ret = std::min(ret, maxCachedBytes); - } - return ret; - } - inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() { LogDebug("CachingDeviceAllocator").log([](auto& log) { log << "cub::CachingDeviceAllocator settings\n" diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h index 6e74648696dd8..a937278be01c4 100644 --- a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h +++ b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h @@ -4,9 +4,11 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/thread_safety_macros.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "CachingHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h" -#include "getCachingDeviceAllocator.h" +#include "CachingDeviceAllocator.h" +#include "CachingHostAllocator.h" +#include "cachingAllocatorCommon.h" #include