diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 5300f8d2d2fba..5051a012af9e0 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -1,3 +1,6 @@ +#include +#include + #include #include @@ -7,7 +10,26 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/getCudaDrvErrorString.h" + +void setCudaLimit(cudaLimit limit, const char* name, size_t request) { + // read the current device + int device; + cudaCheck(cudaGetDevice(&device)); + // try to set the requested limit + auto result = cudaDeviceSetLimit(limit, request); + if (cudaErrorUnsupportedLimit == result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + cudaCheck(cudaDeviceGetLimit(&value, limit)); + if (cudaSuccess != result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to " << request << ", current value is " << value ; + } else if (value != request) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value << " instead of requested " << request; + } +} CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& iRegistry) { bool configEnabled = config.getUntrackedParameter("enabled"); @@ -21,23 +43,123 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << ".\n" << "Disabling the CUDAService."; return; } - edm::LogInfo("CUDAService") << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices"; + edm::LogInfo log("CUDAService"); + computeCapabilities_.reserve(numberOfDevices_); + log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n"; auto numberOfStreamsPerDevice = config.getUntrackedParameter("numberOfStreamsPerDevice"); if (numberOfStreamsPerDevice > 0) { numberOfStreamsTotal_ = numberOfStreamsPerDevice * numberOfDevices_; - edm::LogSystem("CUDAService") << "Number of edm::Streams per CUDA device has been set to " << numberOfStreamsPerDevice << ". With " << numberOfDevices_ << " CUDA devices, this means total of " << numberOfStreamsTotal_ << " edm::Streams for all CUDA devices."; // TODO: eventually silence to LogDebug + log << "Number of edm::Streams per CUDA device has been set to " << numberOfStreamsPerDevice << ", for a total of " << numberOfStreamsTotal_ << " edm::Streams across all CUDA device(s).\n\n"; } - computeCapabilities_.reserve(numberOfDevices_); + auto const& limits = config.getUntrackedParameter("limits"); + auto printfFifoSize = limits.getUntrackedParameter("cudaLimitPrintfFifoSize"); + auto stackSize = limits.getUntrackedParameter("cudaLimitStackSize"); + auto mallocHeapSize = limits.getUntrackedParameter("cudaLimitMallocHeapSize"); + auto devRuntimeSyncDepth = limits.getUntrackedParameter("cudaLimitDevRuntimeSyncDepth"); + auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("cudaLimitDevRuntimePendingLaunchCount"); + for (int i = 0; i < numberOfDevices_; ++i) { + // read information about the compute device. + // see the documentation of cudaGetDeviceProperties() for more information. cudaDeviceProp properties; - cudaCheck(cudaGetDeviceProperties(&properties, i)); - edm::LogInfo("CUDAService") << "Device " << i << " with compute capability " << properties.major << "." << properties.minor; + cudaCheck(cudaGetDeviceProperties(&properties, i)); + log << "CUDA device " << i << ": " << properties.name << '\n'; + log << " compute capability: " << properties.major << "." << properties.minor << '\n'; computeCapabilities_.emplace_back(properties.major, properties.minor); + + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaSetDeviceFlags(cudaDeviceScheduleAuto | cudaDeviceMapHost)); + + // read the free and total amount of memory available for allocation by the device, in bytes. + // see the documentation of cudaMemGetInfo() for more information. + size_t freeMemory, totalMemory; + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + log << " memory: " << std::setw(6) << freeMemory / (1 << 20) << " MB free / " << std::setw(6) << totalMemory / (1 << 20) << " MB total\n"; + log << '\n'; + + // set and read the CUDA device flags. + // see the documentation of cudaSetDeviceFlags and cudaGetDeviceFlags for more information. + log << "CUDA flags\n"; + unsigned int flags; + cudaCheck(cudaGetDeviceFlags(&flags)); + switch (flags & cudaDeviceScheduleMask) { + case cudaDeviceScheduleAuto: + log << " thread policy: default\n"; + break; + case cudaDeviceScheduleSpin: + log << " thread policy: spin\n"; + break; + case cudaDeviceScheduleYield: + log << " thread policy: yield\n"; + break; + case cudaDeviceScheduleBlockingSync: + log << " thread policy: blocking sync\n"; + break; + default: + log << " thread policy: undefined\n"; + } + if (flags & cudaDeviceMapHost) { + log << " pinned host memory allocations: enabled\n"; + } else { + log << " pinned host memory allocations: disabled\n"; + } + if (flags & cudaDeviceLmemResizeToMax) { + log << " kernel host memory reuse: enabled\n"; + } else { + log << " kernel host memory reuse: disabled\n"; + } + log << '\n'; + + // set and read the CUDA resource limits. + // see the documentation of cudaDeviceSetLimit() for more information. + + // cudaLimitPrintfFifoSize controls the size in bytes of the shared FIFO used by the + // printf() device system call. + if (printfFifoSize >= 0) { + setCudaLimit(cudaLimitPrintfFifoSize, "cudaLimitPrintfFifoSize", printfFifoSize); + } + // cudaLimitStackSize controls the stack size in bytes of each GPU thread. + if (stackSize >= 0) { + setCudaLimit(cudaLimitStackSize, "cudaLimitStackSize", stackSize); + } + // cudaLimitMallocHeapSize controls the size in bytes of the heap used by the malloc() + // and free() device system calls. + if (mallocHeapSize >= 0) { + setCudaLimit(cudaLimitMallocHeapSize, "cudaLimitMallocHeapSize", mallocHeapSize); + } + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + // cudaLimitDevRuntimeSyncDepth controls the maximum nesting depth of a grid at which + // a thread can safely call cudaDeviceSynchronize(). + if (devRuntimeSyncDepth >= 0) { + setCudaLimit(cudaLimitDevRuntimeSyncDepth, "cudaLimitDevRuntimeSyncDepth", devRuntimeSyncDepth); + } + // cudaLimitDevRuntimePendingLaunchCount controls the maximum number of outstanding + // device runtime launches that can be made from the current device. + if (devRuntimePendingLaunchCount >= 0) { + setCudaLimit(cudaLimitDevRuntimePendingLaunchCount, "cudaLimitDevRuntimePendingLaunchCount", devRuntimePendingLaunchCount); + } + } + + size_t value; + log << "CUDA limits\n"; + cudaCheck(cudaDeviceGetLimit(&value, cudaLimitPrintfFifoSize)); + log << " printf buffer size: " << std::setw(10) << value << '\n'; + cudaCheck(cudaDeviceGetLimit(&value, cudaLimitStackSize)); + log << " stack size: " << std::setw(10) << value << '\n'; + cudaCheck(cudaDeviceGetLimit(&value, cudaLimitMallocHeapSize)); + log << " malloc heap size: " << std::setw(10) << value << '\n'; + if ((properties.major > 3) or (properties.major == 3 and properties.minor >= 5)) { + cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimeSyncDepth)); + log << " runtime sync depth: " << std::setw(10) << value << '\n'; + cudaCheck(cudaDeviceGetLimit(&value, cudaLimitDevRuntimePendingLaunchCount)); + log << " runtime pending launch count: " << std::setw(10) << value << '\n'; + } + log << '\n'; } - edm::LogInfo("CUDAService") << "CUDAService fully initialized"; + log << "CUDAService fully initialized"; enabled_ = true; } @@ -57,31 +179,44 @@ CUDAService::~CUDAService() { void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions) { edm::ParameterSetDescription desc; desc.addUntracked("enabled", true); - desc.addUntracked("numberOfStreamsPerDevice", 0)->setComment("Upper limit of the number of edm::Streams that will run on a single CUDA GPU device. The remaining edm::Streams will be run only on other devices (for time being this means CPU in practice). The value '0' means 'unlimited', a value >= 1 imposes the limit."); + desc.addUntracked("numberOfStreamsPerDevice", 0)->setComment("Upper limit of the number of edm::Streams that will run on a single CUDA GPU device. The remaining edm::Streams will be run only on other devices (for time being this means CPU in practice).\nThe value '0' means 'unlimited', a value >= 1 imposes the limit."); + + edm::ParameterSetDescription limits; + limits.addUntracked("cudaLimitPrintfFifoSize", -1)->setComment("Size in bytes of the shared FIFO used by the printf() device system call."); + limits.addUntracked("cudaLimitStackSize", -1)->setComment("Stack size in bytes of each GPU thread."); + limits.addUntracked("cudaLimitMallocHeapSize", -1)->setComment("Size in bytes of the heap used by the malloc() and free() device system calls."); + limits.addUntracked("cudaLimitDevRuntimeSyncDepth", -1)->setComment("Maximum nesting depth of a grid at which a thread can safely call cudaDeviceSynchronize()."); + limits.addUntracked("cudaLimitDevRuntimePendingLaunchCount", -1)->setComment("Maximum number of outstanding device runtime launches that can be made from the current device."); + desc.addUntracked("limits", limits)->setComment("See the documentation of cudaDeviceSetLimit for more information.\nSetting any of these options to -1 keeps the default value."); descriptions.add("CUDAService", desc); } int CUDAService::deviceWithMostFreeMemory() const { - size_t freeMem = 0; - int devId = -1; + // save the current device + int currentDevice; + cudaCheck(cudaGetDevice(¤tDevice)); + + size_t maxFreeMemory = 0; + int device = -1; for(int i = 0; i < numberOfDevices_; ++i) { - // TODO: understand why the api-wrappers version gives same value for all devices /* + // TODO: understand why the api-wrappers version gives same value for all devices auto device = cuda::device::get(i); - auto mem = device.memory.amount_free(); + auto freeMemory = device.memory.amount_free(); */ - size_t free, tot; + size_t freeMemory, totalMemory; cudaSetDevice(i); - cudaMemGetInfo(&free, &tot); - auto mem = free; - edm::LogPrint("CUDAService") << "Device " << i << " free memory " << mem; - if(mem > freeMem) { - freeMem = mem; - devId = i; + cudaMemGetInfo(&freeMemory, &totalMemory); + edm::LogPrint("CUDAService") << "CUDA device " << i << ": " << freeMemory / (1 << 20) << " MB free / " << totalMemory / (1 << 20) << " MB total memory"; + if (freeMemory > maxFreeMemory) { + maxFreeMemory = freeMemory; + device = i; } } - return devId; + // restore the current device + cudaCheck(cudaSetDevice(currentDevice)); + return device; } void CUDAService::setCurrentDevice(int device) const {