From e5ed64c81cbd14738c025e02492551fbd734c4d7 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 19 Aug 2021 19:23:03 +0200
Subject: [PATCH] Add public interface for constructing and freeing caching
 allocators

---
 .../CUDAServices/src/CUDAService.cc           |  9 ++--
 .../interface/cachingAllocators.h             | 13 ++++++
 .../src/cachingAllocatorCommon.h              | 42 +++++++++++++++++++
 .../CUDAUtilities/src/cachingAllocators.cc    | 16 +++++++
 .../src/getCachingDeviceAllocator.h           | 36 ++--------------
 .../src/getCachingHostAllocator.h             |  6 ++-
 6 files changed, 81 insertions(+), 41 deletions(-)
 create mode 100644 HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
 create mode 100644 HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
 create mode 100644 HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 5d1bd30264186..d7f194829d159 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -16,8 +16,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h"
-#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 
 void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
   // read the current device
@@ -300,8 +299,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config) {
 
   // Make sure the caching allocators and stream/event caches are constructed before declaring successful construction
   if constexpr (cms::cuda::allocator::useCaching) {
-    cms::cuda::allocator::getCachingDeviceAllocator();
-    cms::cuda::allocator::getCachingHostAllocator();
+    cms::cuda::allocator::cachingAllocatorsConstruct();
   }
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
@@ -319,8 +317,7 @@ CUDAService::~CUDAService() {
   if (enabled_) {
     // Explicitly destruct the allocator before the device resets below
     if constexpr (cms::cuda::allocator::useCaching) {
-      cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
-      cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+      cms::cuda::allocator::cachingAllocatorsFreeCached();
     }
     cms::cuda::getEventCache().clear();
     cms::cuda::getStreamCache().clear();
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h b/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
new file mode 100644
index 0000000000000..3eede5cf5c5ed
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h
@@ -0,0 +1,13 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
+#define HeterogeneousCore_CUDAUtilities_interface_cachingAllocators_h
+
+namespace cms::cuda::allocator {
+  // Use caching or not
+  constexpr bool useCaching = true;
+
+  // these intended to be called only from CUDAService
+  void cachingAllocatorsConstruct();
+  void cachingAllocatorsFreeCached();
+}  // namespace cms::cuda::allocator
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h b/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
new file mode 100644
index 0000000000000..c7f2fd038c297
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/cachingAllocatorCommon.h
@@ -0,0 +1,42 @@
+#ifndef HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
+#define HeterogeneousCore_CUDACore_src_cachingAllocatorCommon
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+
+#include <algorithm>
+#include <limits>
+
+namespace cms::cuda::allocator {
+  // Growth factor (bin_growth in cub::CachingDeviceAllocator
+  constexpr unsigned int binGrowth = 2;
+  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
+  constexpr unsigned int minBin = 8;
+  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
+  constexpr unsigned int maxBin = 30;
+  // Total storage for the allocator. 0 means no limit.
+  constexpr size_t maxCachedBytes = 0;
+  // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
+  constexpr double maxCachedFraction = 0.8;
+  constexpr bool debug = false;
+
+  inline size_t minCachedBytes() {
+    size_t ret = std::numeric_limits<size_t>::max();
+    int currentDevice;
+    cudaCheck(cudaGetDevice(&currentDevice));
+    const int numberOfDevices = deviceCount();
+    for (int i = 0; i < numberOfDevices; ++i) {
+      size_t freeMemory, totalMemory;
+      cudaCheck(cudaSetDevice(i));
+      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
+    }
+    cudaCheck(cudaSetDevice(currentDevice));
+    if (maxCachedBytes > 0) {
+      ret = std::min(ret, maxCachedBytes);
+    }
+    return ret;
+  }
+}  // namespace cms::cuda::allocator
+
+#endif
diff --git a/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc b/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
new file mode 100644
index 0000000000000..8a4511eb3ac60
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/cachingAllocators.cc
@@ -0,0 +1,16 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
+
+#include "getCachingDeviceAllocator.h"
+#include "getCachingHostAllocator.h"
+
+namespace cms::cuda::allocator {
+  void cachingAllocatorsConstruct() {
+    cms::cuda::allocator::getCachingDeviceAllocator();
+    cms::cuda::allocator::getCachingHostAllocator();
+  }
+
+  void cachingAllocatorsFreeCached() {
+    cms::cuda::allocator::getCachingDeviceAllocator().FreeAllCached();
+    cms::cuda::allocator::getCachingHostAllocator().FreeAllCached();
+  }
+}  // namespace cms::cuda::allocator
diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
index 3770dbac574d9..492d97f800e69 100644
--- a/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h
@@ -4,44 +4,14 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/deviceCount.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
+
 #include "CachingDeviceAllocator.h"
+#include "cachingAllocatorCommon.h"
 
 #include <iomanip>
 
 namespace cms::cuda::allocator {
-  // Use caching or not
-  constexpr bool useCaching = true;
-  // Growth factor (bin_growth in cub::CachingDeviceAllocator
-  constexpr unsigned int binGrowth = 2;
-  // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
-  constexpr unsigned int minBin = 8;
-  // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
-  constexpr unsigned int maxBin = 30;
-  // Total storage for the allocator. 0 means no limit.
-  constexpr size_t maxCachedBytes = 0;
-  // Fraction of total device memory taken for the allocator. In case there are multiple devices with different amounts of memory, the smallest of them is taken. If maxCachedBytes is non-zero, the smallest of them is taken.
-  constexpr double maxCachedFraction = 0.8;
-  constexpr bool debug = false;
-
-  inline size_t minCachedBytes() {
-    size_t ret = std::numeric_limits<size_t>::max();
-    int currentDevice;
-    cudaCheck(cudaGetDevice(&currentDevice));
-    const int numberOfDevices = deviceCount();
-    for (int i = 0; i < numberOfDevices; ++i) {
-      size_t freeMemory, totalMemory;
-      cudaCheck(cudaSetDevice(i));
-      cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
-      ret = std::min(ret, static_cast<size_t>(maxCachedFraction * freeMemory));
-    }
-    cudaCheck(cudaSetDevice(currentDevice));
-    if (maxCachedBytes > 0) {
-      ret = std::min(ret, maxCachedBytes);
-    }
-    return ret;
-  }
-
   inline notcub::CachingDeviceAllocator& getCachingDeviceAllocator() {
     LogDebug("CachingDeviceAllocator").log([](auto& log) {
       log << "cub::CachingDeviceAllocator settings\n"
diff --git a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
index 6e74648696dd8..a937278be01c4 100644
--- a/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
+++ b/HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h
@@ -4,9 +4,11 @@
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "CachingHostAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 
-#include "getCachingDeviceAllocator.h"
+#include "CachingDeviceAllocator.h"
+#include "CachingHostAllocator.h"
+#include "cachingAllocatorCommon.h"
 
 #include <iomanip>