From c8067d7f08aa6f1b1f0f0f6baae5e43e2c912473 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Wed, 1 Aug 2018 15:01:56 +0200
Subject: [PATCH] Extend the CUDAService to support CUDA device flags and
 limits (#103)

Print a single LogInfo status message with all the devices' details.

Set the CUDA device flags (hard coded) and print them.
Configure the CUDA device limits and print them.

See the documentation of cudaSetDeviceFlags and cudaDeviceSetLimit for more information.
---
 .../CUDAServices/src/CUDAService.cc           | 175 ++++++++++++++++--
 1 file changed, 155 insertions(+), 20 deletions(-)
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 5300f8d2d2fba..5051a012af9e0 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -1,3 +1,6 @@
+#include <iomanip>
+#include <iostream>
+
 #include <cuda.h>
 #include <cuda/api_wrappers.h>
 
@@ -7,7 +10,26 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/getCudaDrvErrorString.h"
+
+void setCudaLimit(cudaLimit limit, const char* name, size_t request) {
+  // read the current device
+  int device;
+  cudaCheck(cudaGetDevice(&device));
+  // try to set the requested limit
+  auto result = cudaDeviceSetLimit(limit, request);
+  if (cudaErrorUnsupportedLimit == result) {
+    edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\"";
+    return;
+  }
+  // read back the limit value
+  size_t value;
+  cudaCheck(cudaDeviceGetLimit(&value, limit));
+  if (cudaSuccess != result) {
+    edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to " << request << ", current value is " << value ;
+  } else if (value != request) {
+    edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value << " instead of requested " << request;
+  }
+}
 
 CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) {
   bool configEnabled = config.getUntrackedParameter<bool>("enabled");
@@ -21,23 +43,123 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
     edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << ".\n" << "Disabling the CUDAService.";
     return;
   }
-  edm::LogInfo("CUDAService") << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices";
+  edm::LogInfo log("CUDAService");
+  computeCapabilities_.reserve(numberOfDevices_);
+  log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
 
   auto numberOfStreamsPerDevice = config.getUntrackedParameter<unsigned int>("numberOfStreamsPerDevice");
   if (numberOfStreamsPerDevice > 0) {
     numberOfStreamsTotal_ = numberOfStreamsPerDevice * numberOfDevices_;
-    edm::LogSystem("CUDAService") << "Number of edm::Streams per CUDA device has been set to " << numberOfStreamsPerDevice << ". With " << numberOfDevices_ << " CUDA devices, this means total of " << numberOfStreamsTotal_ << " edm::Streams for all CUDA devices."; // TODO: eventually silence to LogDebug
+    log << "Number of edm::Streams per CUDA device has been set to " << numberOfStreamsPerDevice << ", for a total of " << numberOfStreamsTotal_ << " edm::Streams across all CUDA device(s).\n\n";
   }
 
-  computeCapabilities_.reserve(numberOfDevices_);
+  auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
+  auto printfFifoSize               = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
+  auto stackSize                    = limits.getUntrackedParameter<int>("cudaLimitStackSize");
+  auto mallocHeapSize               = limits.getUntrackedParameter<int>("cudaLimitMallocHeapSize");
+  auto devRuntimeSyncDepth          = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
+  auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
+
   for (int i = 0; i < numberOfDevices_; ++i) {
+    // read information about the compute device.
+    // see the documentation of cudaGetDeviceProperties() for more information.
     cudaDeviceProp properties;
-    cudaCheck(cudaGetDeviceProperties(&properties, i)); 
-    edm::LogInfo("CUDAService") << "Device " << i << " with compute capability " << properties.major << "." << properties.minor;
+    cudaCheck(cudaGetDeviceProperties(&properties, i));
+    log << "CUDA device " << i << ": " << properties.name << '\n';
+    log << "  compute capability: " << properties.major << "." << properties.minor << '\n';
     computeCapabilities_.emplace_back(properties.major, properties.minor);
+
+    cudaCheck(cudaSetDevice(i));
+    cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost));
+
+    // read the free and total amount of memory available for allocation by the device, in bytes.
+    // see the documentation of cudaMemGetInfo() for more information.
+    size_t freeMemory, totalMemory;
+    cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
+    log << "  memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) << totalMemory / (1 << 20) << " MB total\n";
+    log << '\n';
+
+    // set and read the CUDA device flags.
+    // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for  more information.
+    log << "CUDA flags\n";
+    unsigned int flags;
+    cudaCheck(cudaGetDeviceFlags(&flags));
+    switch (flags & cudaDeviceScheduleMask) {
+      case cudaDeviceScheduleAuto:
+        log << "  thread policy:                   default\n";
+        break;
+      case cudaDeviceScheduleSpin:
+        log << "  thread policy:                      spin\n";
+        break;
+      case cudaDeviceScheduleYield:
+        log << "  thread policy:                     yield\n";
+        break;
+      case cudaDeviceScheduleBlockingSync:
+        log << "  thread policy:             blocking sync\n";
+        break;
+      default:
+        log << "  thread policy:                 undefined\n";
+    }
+    if (flags & cudaDeviceMapHost) {
+      log << "  pinned host memory allocations:  enabled\n";
+    } else {
+      log << "  pinned host memory allocations: disabled\n";
+    }
+    if (flags & cudaDeviceLmemResizeToMax) {
+      log << "  kernel host memory reuse:        enabled\n";
+    } else {
+      log << "  kernel host memory reuse:       disabled\n";
+    }
+    log << '\n';
+
+    // set and read the CUDA resource limits.
+    // see the documentation of cudaDeviceSetLimit() for more information.
+
+    // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the
+    // printf() device system call.
+    if (printfFifoSize >= 0) {
+      setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize);
+    }
+    // cudaLimitStackSize controls the stack size in bytes of each GPU thread.
+    if (stackSize >= 0) {
+      setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize);
+    }
+    // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc()
+    // and free() device system calls.
+    if (mallocHeapSize >= 0) {
+      setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize);
+    }
+    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+      // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which
+      // a thread can safely call cudaDeviceSynchronize().
+      if (devRuntimeSyncDepth >= 0) {
+        setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth);
+      }
+      // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding
+      // device runtime launches that can be made from the current device.
+      if (devRuntimePendingLaunchCount >= 0) {
+        setCudaLimit(cudaLimitDevRuntimePendingLaunchCount, "cudaLimitDevRuntimePendingLaunchCount", devRuntimePendingLaunchCount);
+      }
+    }
+
+    size_t value;
+    log << "CUDA limits\n";
+    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize));
+    log << "  printf buffer size:           " << std::setw(10) << value << '\n';
+    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize));
+    log << "  stack size:                   " << std::setw(10) << value << '\n';
+    cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize));
+    log << "  malloc heap size:             " << std::setw(10) << value << '\n';
+    if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) {
+      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth));
+      log << "  runtime sync depth:           " << std::setw(10) << value << '\n';
+      cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount));
+      log << "  runtime pending launch count: " << std::setw(10) << value << '\n';
+    }
+    log << '\n';
   }
 
-  edm::LogInfo("CUDAService") << "CUDAService fully initialized";
+  log << "CUDAService fully initialized";
   enabled_ = true;
 }
 
@@ -57,31 +179,44 @@ CUDAService::~CUDAService() {
 void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions) {
   edm::ParameterSetDescription desc;
   desc.addUntracked<bool>("enabled", true);
-  desc.addUntracked<unsigned int>("numberOfStreamsPerDevice", 0)->setComment("Upper limit of the number of edm::Streams that will run on a single CUDA GPU device. The remaining edm::Streams will be run only on other devices (for time being this means CPU in practice). The value '0' means 'unlimited', a value >= 1 imposes the limit.");
+  desc.addUntracked<unsigned int>("numberOfStreamsPerDevice", 0)->setComment("Upper limit of the number of edm::Streams that will run on a single CUDA GPU device. The remaining edm::Streams will be run only on other devices (for time being this means CPU in practice).\nThe value '0' means 'unlimited', a value >= 1 imposes the limit.");
+
+  edm::ParameterSetDescription limits;
+  limits.addUntracked<int>("cudaLimitPrintfFifoSize", -1)->setComment("Size in bytes of the shared FIFO used by the printf() device system call.");
+  limits.addUntracked<int>("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread.");
+  limits.addUntracked<int>("cudaLimitMallocHeapSize", -1)->setComment("Size in bytes of the heap used by the malloc() and free() device system calls.");
+  limits.addUntracked<int>("cudaLimitDevRuntimeSyncDepth", -1)->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize().");
+  limits.addUntracked<int>("cudaLimitDevRuntimePendingLaunchCount", -1)->setComment("Maximum number of outstanding device runtime launches that can be made from the current device.");
+  desc.addUntracked<edm::ParameterSetDescription>("limits", limits)->setComment("See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps the default value.");
 
   descriptions.add("CUDAService", desc);
 }
 
 int CUDAService::deviceWithMostFreeMemory() const {
-  size_t freeMem = 0;
-  int devId = -1;
+  // save the current device
+  int currentDevice;
+  cudaCheck(cudaGetDevice(&currentDevice));
+
+  size_t maxFreeMemory = 0;
+  int device = -1;
   for(int i = 0; i < numberOfDevices_; ++i) {
-    // TODO: understand why the api-wrappers version gives same value for all devices
     /*
+    // TODO: understand why the api-wrappers version gives same value for all devices
     auto device = cuda::device::get(i);
-    auto mem = device.memory.amount_free();
+    auto freeMemory = device.memory.amount_free();
     */
-    size_t free, tot;
+    size_t freeMemory, totalMemory;
     cudaSetDevice(i);
-    cudaMemGetInfo(&free, &tot);
-    auto mem = free;
-    edm::LogPrint("CUDAService") << "Device " << i << " free memory " << mem;
-    if(mem > freeMem) {
-      freeMem = mem;
-      devId = i;
+    cudaMemGetInfo(&freeMemory, &totalMemory);
+    edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / " << totalMemory / (1 << 20) << " MB total memory";
+    if (freeMemory > maxFreeMemory) {
+      maxFreeMemory = freeMemory;
+      device = i;
     }
   }
-  return devId;
+  // restore the current device
+  cudaCheck(cudaSetDevice(currentDevice));
+  return device;
 }
 
 void CUDAService::setCurrentDevice(int device) const {