From 8deb1d9dc70dd9d933dfbaf0811c7e7b744bbee3 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Sat, 16 Mar 2019 13:54:59 +0100
Subject: [PATCH] Use only CUDA devices with a supported architecture

For each available CUDA device, check if its architecture is supported running a simple kernel.
This allows to restrict using only the supported devices
  - at configuration time, in the SwitchProducerCUDA, via the cudaIsEnabled test;
  - at run time, in the CUDAService and its clients;
  - when running tests, via exitSansCUDADevices.

Includes Matti's fix for the sizes of stream and event caches (#289).
---
 HeterogeneousCore/CUDACore/src/GPUCuda.cc     | 11 ++---
 .../CUDACore/src/chooseCUDADevice.cc          |  2 +-
 .../CUDAServices/bin/BuildFile.xml            |  1 +
 .../CUDAServices/bin/cudaIsEnabled.cpp        | 28 +------------
 .../CUDAServices/interface/CUDAService.h      |  4 ++
 .../plugins/CUDAMonitoringService.cc          | 14 +++----
 .../CUDAServices/src/CUDAService.cc           | 36 ++++++++--------
 .../CUDAServices/test/testCUDAService.cpp     | 11 +++--
 .../interface/supportedCUDADevices.h          |  8 ++++
 .../CUDAUtilities/src/exitSansCUDADevices.cc  |  6 +++
 .../CUDAUtilities/src/supportedCUDADevices.cu | 42 +++++++++++++++++++
 11 files changed, 97 insertions(+), 66 deletions(-)
 create mode 100644 HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
 create mode 100644 HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
index d712dcba75a8b..c26ecce0f854a 100644
--- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc
+++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -1,4 +1,5 @@
 #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
+#include "chooseCUDADevice.h"
 
 #include "FWCore/MessageLogger/interface/MessageLogger.h"
 #include "FWCore/ServiceRegistry/interface/Service.h"
@@ -34,20 +35,14 @@ namespace heterogeneous {
       return;
     }
 
-    // For startes we "statically" assign the device based on
-    // edm::Stream number. This is suboptimal if the number of
-    // edm::Streams is not a multiple of the number of CUDA devices
-    // (and even then there is no load balancing).
-    //
-    // TODO: improve. Possible ideas include
+    // TODO: possible ideas to improve the "assignment" logic include
     // - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
     //   * the first module of a chain dictates the device for the rest of the chain
     // - our own CUDA memory allocator
     //   * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
     //   * would probably still need some buffer space/device to hold e.g. conditions data
     //     - for conditions, how to handle multiple lumis per job?
-    deviceId_ = id % cudaService->numberOfDevices();
-
+    deviceId_ = cudacore::chooseCUDADevice(id);
     cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
 
     // Create the CUDA stream for this module-edm::Stream pair
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
index a582ed2f72866..ce487507500cc 100644
--- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -13,6 +13,6 @@ namespace cudacore {
     // (and even then there is no load balancing).
     //
     // TODO: improve the "assignment" logic
-    return id % cudaService->numberOfDevices();
+    return cudaService->devices()[id % cudaService->numberOfDevices()];
   }
 }
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
index 041ed25ba134a..58ce8cc807515 100644
--- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -4,4 +4,5 @@
 
 <bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
   <use name="cuda"/>
+  <use name="HeterogeneousCore/CUDAUtilities"/>
 </bin>
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
index b24f05adb2213..db6e7dd141c19 100644
--- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
+++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -1,31 +1,7 @@
-#include <algorithm>
-#include <array>
 #include <cstdlib>
-#include <iostream>
 
-#include <cuda_runtime.h>
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
 
 int main() {
-  int devices = 0;
-  auto status = cudaGetDeviceCount(& devices);
-  if (status != cudaSuccess) {
-    return EXIT_FAILURE;
-  }
-
-  int minimumMajor = 6; // min minor is implicitly 0
-
-  // This approach (requiring all devices are supported) is rather
-  // conservative. In principle we could consider just dropping the
-  // unsupported devices. Currently that would be easiest to achieve
-  // in CUDAService though.
-  for (int i = 0; i < devices; ++i) {
-    cudaDeviceProp properties;
-    cudaGetDeviceProperties(&properties, i);
-
-    if(properties.major < minimumMajor) {
-      return EXIT_FAILURE;
-    }
-  }
-
-  return EXIT_SUCCESS;
+  return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
 }
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
index e54ec1be8ad20..7125b2c0dcf6e 100644
--- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h
+++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -52,6 +52,9 @@ class CUDAService {
 
   int numberOfDevices() const { return numberOfDevices_; }
 
+  // devices supported by the CUDA configuration and compilation flags
+  std::vector<int> const& devices() const { return supportedDevices_; }
+
   // major, minor
   std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }
 
@@ -152,6 +155,7 @@ class CUDAService {
   std::unique_ptr<CUDAEventCache> cudaEventCache_;
 
   int numberOfDevices_ = 0;
+  std::vector<int> supportedDevices_;
   std::vector<std::pair<int, int>> computeCapabilities_;
   bool enabled_ = false;
 };
diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
index 7b7711c63c502..5c1d042a6420b 100644
--- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -30,7 +30,7 @@ class CUDAMonitoringService {
   void postEvent(edm::StreamContext const& sc);
 
 private:
-  int numberOfDevices_ = 0;
+  std::vector<int> devices_;
 };
 
 CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
@@ -38,7 +38,7 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
   edm::Service<CUDAService> cudaService;
   if(!cudaService->enabled())
     return;
-  numberOfDevices_ = cudaService->numberOfDevices();
+  devices_ = cudaService->devices();
 
   if(config.getUntrackedParameter<bool>("memoryConstruction")) {
     registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
 // activity handlers
 namespace {
   template <typename T>
-  void dumpUsedMemory(T& log, int num) {
+  void dumpUsedMemory(T& log, std::vector<int> const& devices) {
     int old = 0;
     cudaCheck(cudaGetDevice(&old));
-    for(int i = 0; i < num; ++i) {
+    for(int i: devices) {
       size_t freeMemory, totalMemory;
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
 void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
   auto log = edm::LogPrint("CUDAMonitoringService");
   log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
-  dumpUsedMemory(log, numberOfDevices_);
+  dumpUsedMemory(log, devices_);
 }
 
 void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
   auto log = edm::LogPrint("CUDAMonitoringService");
   log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
-  dumpUsedMemory(log, numberOfDevices_);
+  dumpUsedMemory(log, devices_);
 }
 
 void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
   auto log = edm::LogPrint("CUDAMonitoringService");
   log << "CUDA device memory after event";
-  dumpUsedMemory(log, numberOfDevices_);
+  dumpUsedMemory(log, devices_);
 }
 
 DEFINE_FWK_SERVICE(CUDAMonitoringService);
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 9db5d89de1f83..2d6e0bfc6fd09 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -12,6 +12,7 @@
 #include "FWCore/Utilities/interface/ReusableObjectHolder.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
 
 #include "CachingDeviceAllocator.h"
 #include "CachingHostAllocator.h"
@@ -94,10 +95,10 @@ namespace {
     }
   }
 
-  void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
+  void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
     int device;
     cudaCheck(cudaGetDevice(&device));
-    for(int i=0; i<numberOfDevices; ++i) {
+    for (int i : cs.devices()) {
       cudaCheck(cudaSetDevice(i));
       preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
           return cs.make_device_unique<char[]>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     return;
   }
 
-  auto status = cudaGetDeviceCount(&numberOfDevices_);
-  if (cudaSuccess != status) {
+  supportedDevices_ = supportedCUDADevices();
+  numberOfDevices_ = supportedDevices_.size();
+  if (numberOfDevices_ == 0) {
     edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
     return;
   }
   edm::LogInfo log("CUDAService");
-  computeCapabilities_.reserve(numberOfDevices_);
-  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
+  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
 
   auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
   auto printfFifoSize               = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
@@ -137,7 +138,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
   auto devRuntimeSyncDepth          = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
   auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
 
-  for (int i = 0; i < numberOfDevices_; ++i) {
+  int lastDevice = supportedDevices_.back();
+  computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
+  for (int i: supportedDevices_) {
     // read information about the compute device.
     // see the documentation of cudaGetDeviceProperties() for more information.
     cudaDeviceProp properties;
@@ -145,10 +148,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     log << "CUDA device " << i << ": " << properties.name << '\n';
 
     // compute capabilities
+    computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
     log << "  compute capability:          " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
-    computeCapabilities_.emplace_back(properties.major, properties.minor);
     log << "  streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
-    log << "  CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
+    log << "  CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
     log << "  single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
 
     // compute mode
@@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     size_t minCachedBytes = std::numeric_limits<size_t>::max();
     int currentDevice;
     cudaCheck(cudaGetDevice(&currentDevice));
-    for (int i = 0; i < numberOfDevices_; ++i) {
+    for (int i: supportedDevices_) {
       size_t freeMemory, totalMemory;
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -331,8 +334,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     log << "cub::CachingDeviceAllocator disabled\n";
   }
 
-  cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_);
-  cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_);
+  cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+1);
+  cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+1);
 
   log << "\n";
 
@@ -340,7 +343,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
   enabled_ = true;
 
   // Preallocate buffers if asked to
-  devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
+  devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
   hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
 }
 
@@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
     cudaEventCache_.reset();
     cudaStreamCache_.reset();
 
-    for (int i = 0; i < numberOfDevices_; ++i) {
+    for (int i: supportedDevices_) {
       cudaCheck(cudaSetDevice(i));
       cudaCheck(cudaDeviceSynchronize());
       // Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {
 
   size_t maxFreeMemory = 0;
   int device = -1;
-  for(int i = 0; i < numberOfDevices_; ++i) {
+  for (int i: supportedDevices_) {
     /*
     // TODO: understand why the api-wrappers version gives same value for all devices
     auto device = cuda::device::get(i);
@@ -432,9 +435,6 @@ struct CUDAService::Allocator {
   template <typename ...Args>
   Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}
 
-  void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
-  void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
-
   size_t maxAllocation;
   notcub::CachingDeviceAllocator deviceAllocator;
   notcub::CachingHostAllocator hostAllocator;
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
index 95768bdbd4b58..5e1bc65645841 100644
--- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -14,6 +14,7 @@
 #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
 #include "FWCore/Utilities/interface/Exception.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
 
 namespace {
   CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
 
   // Test setup: check if a simple CUDA runtime API call fails:
   // if so, skip the test with the CUDAService enabled
-  int deviceCount = 0;
-  auto ret = cudaGetDeviceCount( &deviceCount );
+  int deviceCount = supportedCUDADevices().size();
 
-  if( ret != cudaSuccess ) {
-    WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
-         << ret << ") " << cudaGetErrorString( ret ) 
-         << ". Running only tests not requiring devices.");
+  if (deviceCount == 0) {
+    WARN("No supported CUDA devices available. Running only tests not requiring devices.");
   }
 
   SECTION("CUDAService enabled") {
@@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
     }
 
     auto cs = makeCUDAService(ps, ar);
+    cudaError_t ret;
 
     SECTION("CUDA Queries") {
       int driverVersion = 0, runtimeVersion = 0;
diff --git a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
new file mode 100644
index 0000000000000..53d984dd2beaa
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
@@ -0,0 +1,8 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
+#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
+
+#include <vector>
+
+std::vector<int> supportedCUDADevices();
+
+#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
diff --git a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
index 2d166e5c62840..c20f43c5ec794 100644
--- a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 
 #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
 
 void exitSansCUDADevices() {
   int devices = 0;
@@ -16,4 +17,9 @@ void exitSansCUDADevices() {
     std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
     exit(EXIT_SUCCESS);
   }
+  int supported = supportedCUDADevices().size();
+  if (supported == 0) {
+    std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
+    exit(EXIT_SUCCESS);
+  }
 }
diff --git a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
new file mode 100644
index 0000000000000..9d629d2fc7554
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
@@ -0,0 +1,42 @@
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
+
+__global__
+void isSupported(bool * result) {
+  * result = true;
+}
+
+std::vector<int> supportedCUDADevices() {
+  int devices = 0;
+  auto status = cudaGetDeviceCount(&devices);
+  if (status != cudaSuccess or devices == 0) {
+    return {};
+  }
+
+  std::vector<int> supportedDevices;
+  supportedDevices.reserve(devices);
+
+  for (int i = 0; i < devices; ++i) {
+    cudaCheck(cudaSetDevice(i));
+    bool supported = false;
+    bool * supported_d;
+    cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
+    cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
+    isSupported<<<1,1>>>(supported_d);
+    // swallow any eventual error from launching the kernel on an unsupported device
+    cudaGetLastError();
+    cudaCheck(cudaDeviceSynchronize());
+    cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
+    cudaCheck(cudaFree(supported_d));
+    if (supported) {
+      supportedDevices.push_back(i);
+    }
+    cudaCheck(cudaDeviceReset());
+  }
+
+  return supportedDevices;
+}