Merge pull request #34950 from fwyzard/CUDA_fix_concurrent_EventSetup_filling

Fix uploading the EventSetup conditions to multiple CUDA devices [11.3.x]
cms-sw · Aug 23, 2021 · 944e28b · 944e28b
2 parents 07247b0 + c9c01df
commit 944e28b
Show file tree

Hide file tree

Showing 2 changed files with 31 additions and 12 deletions.
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
@@ -9,6 +9,7 @@
 #include "FWCore/Utilities/interface/thread_safety_macros.h"
 #include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
@@ -19,23 +20,26 @@ namespace cms {
     class ESProduct {
     public:
       ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
-        for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
-          gpuDataPerDevice_[i].m_event = getEventCache().get();
+        if (not gpuDataPerDevice_.empty()) {
+          cms::cuda::ScopedSetDevice scopedDevice;
+          for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
+            scopedDevice.set(i);
+            gpuDataPerDevice_[i].m_event = getEventCache().get();
+          }
         }
       }
+
       ~ESProduct() = default;
 
       // transferAsync should be a function of (T&, cudaStream_t)
       // which enqueues asynchronous transfers (possibly kernels as well)
       // to the CUDA stream
       template <typename F>
       const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
-        auto device = currentDevice();
-
+        int device = currentDevice();
         auto& data = gpuDataPerDevice_[device];
 
-        // If GPU data has already been filled, we can return it
-        // immediately
+        // If the GPU data has already been filled, we can return it immediately
         if (not data.m_filled.load()) {
           // It wasn't, so need to fill it
           std::scoped_lock<std::mutex> lk{data.m_mutex};
@@ -103,4 +107,4 @@ namespace cms {
   }  // namespace cuda
 }  // namespace cms
 
-#endif
+#endif  // HeterogeneousCore_CUDACore_ESProduct_h
diff --git a/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h b/HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h
@@ -9,20 +9,35 @@ namespace cms {
   namespace cuda {
     class ScopedSetDevice {
     public:
-      explicit ScopedSetDevice(int newDevice) {
-        cudaCheck(cudaGetDevice(&prevDevice_));
-        cudaCheck(cudaSetDevice(newDevice));
+      // Store the original device, without setting a new one
+      ScopedSetDevice() {
+        // Store the original device
+        cudaCheck(cudaGetDevice(&originalDevice_));
       }
 
+      // Store the original device, and set a new current device
+      explicit ScopedSetDevice(int device) : ScopedSetDevice() {
+        // Change the current device
+        set(device);
+      }
+
+      // Restore the original device
       ~ScopedSetDevice() {
         // Intentionally don't check the return value to avoid
         // exceptions to be thrown. If this call fails, the process is
         // doomed anyway.
-        cudaSetDevice(prevDevice_);
+        cudaSetDevice(originalDevice_);
+      }
+
+      // Set a new current device, without changing the original device
+      // that will be restored when this object is destroyed
+      void set(int device) {
+        // Change the current device
+        cudaCheck(cudaSetDevice(device));
       }
 
     private:
-      int prevDevice_;
+      int originalDevice_;
     };
   }  // namespace cuda
 }  // namespace cms