Skip to content

Commit

Permalink
Use only CUDA devices with a supported architecture
Browse files Browse the repository at this point in the history
For each available CUDA device, check whether its architecture is supported by running a simple kernel.
This makes it possible to restrict execution to only the supported devices:
  - at configuration time, in the SwitchProducerCUDA, via the cudaIsEnabled test;
  - at run time, in the CUDAService and its clients;
  - when running tests, via exitSansCUDADevices.

Includes Matti's fix for the sizes of stream and event caches (#289).
  • Loading branch information
fwyzard committed Mar 24, 2019
1 parent 07607a4 commit 2be0bc1
Show file tree
Hide file tree
Showing 13 changed files with 147 additions and 66 deletions.
11 changes: 3 additions & 8 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
#include "chooseCUDADevice.h"

#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ServiceRegistry/interface/Service.h"
Expand Down Expand Up @@ -34,20 +35,14 @@ namespace heterogeneous {
return;
}

// For starters we "statically" assign the device based on
// edm::Stream number. This is suboptimal if the number of
// edm::Streams is not a multiple of the number of CUDA devices
// (and even then there is no load balancing).
//
// TODO: improve. Possible ideas include
// TODO: possible ideas to improve the "assignment" logic include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = id % cudaService->numberOfDevices();

deviceId_ = cudacore::chooseCUDADevice(id);
cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

// Create the CUDA stream for this module-edm::Stream pair
Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return id % cudaService->numberOfDevices();
return cudaService->devices()[id % cudaService->numberOfDevices()];
}
}
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAServices/bin/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
28 changes: 2 additions & 26 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,7 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

int main() {
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
}
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

Expand Down Expand Up @@ -152,6 +155,7 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h

#include <vector>

// Returns the ids of the visible CUDA devices on which a trivial test kernel
// runs successfully, i.e. the devices whose architecture is supported by the
// CUDA configuration and compilation flags. Returns an empty vector if the
// CUDA runtime cannot be initialised or no device is supported.
std::vector<int> supportedCUDADevices();

#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
int numberOfDevices_ = 0;
std::vector<int> devices_;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
numberOfDevices_ = cudaService->numberOfDevices();
devices_ = cudaService->devices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
Expand Down Expand Up @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, int num) {
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i = 0; i < num; ++i) {
for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand All @@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
36 changes: 18 additions & 18 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
Expand Down Expand Up @@ -94,10 +95,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for(int i=0; i<numberOfDevices; ++i) {
for (int i : cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
Expand All @@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
supportedDevices_ = supportedCUDADevices();
numberOfDevices_ = supportedDevices_.size();
if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
Expand All @@ -137,18 +138,20 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

for (int i = 0; i < numberOfDevices_; ++i) {
int lastDevice = supportedDevices_.back();
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (int i: supportedDevices_) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';

// compute capabilities
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
Expand Down Expand Up @@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand Down Expand Up @@ -331,16 +334,16 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
log << "cub::CachingDeviceAllocator disabled\n";
}

cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_);
cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_);
cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+1);
cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+1);

log << "\n";

log << "CUDAService fully initialized";
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

Expand All @@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
Expand Down Expand Up @@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for(int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
Expand Down Expand Up @@ -432,9 +435,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
Expand Down
42 changes: 42 additions & 0 deletions HeterogeneousCore/CUDAServices/src/supportedCUDADevices.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include <vector>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAServices/interface/supportedCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

// Trivial probe kernel: it has no work to do other than set the flag, so it
// completes (and the flag becomes true) only on devices whose architecture
// matches one of the compiled cubins; on unsupported devices the launch fails
// and the flag is left untouched.
__global__ void isSupported(bool* flag) {
  *flag = true;
}

// Probe every visible CUDA device by launching a trivial kernel on it, and
// return the ids of the devices where the kernel ran successfully (i.e. the
// devices supported by the compiled architectures).
// Returns an empty vector if the CUDA runtime cannot enumerate any device.
std::vector<int> supportedCUDADevices() {
int devices = 0;
// deliberately not cudaCheck'ed: a failure here means "no CUDA", not a bug
auto status = cudaGetDeviceCount(&devices);
if (status != cudaSuccess or devices == 0) {
return {};
}

std::vector<int> supportedDevices;
supportedDevices.reserve(devices);

for (int i = 0; i < devices; ++i) {
cudaCheck(cudaSetDevice(i));
// device-side flag, zeroed so it stays false if the kernel never runs
bool supported = false;
bool * supported_d;
cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
isSupported<<<1,1>>>(supported_d);
// swallow any eventual error from launching the kernel on an unsupported device
cudaGetLastError();
// NOTE(review): assumes cudaDeviceSynchronize() returns cudaSuccess after a
// failed launch has been cleared by cudaGetLastError() — verify on all
// supported runtime versions, otherwise cudaCheck may abort here
cudaCheck(cudaDeviceSynchronize());
cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
cudaCheck(cudaFree(supported_d));
if (supported) {
supportedDevices.push_back(i);
}
// wipe any state the probe left behind, so later users see a clean device
cudaCheck(cudaDeviceReset());
}

return supportedDevices;
}
11 changes: 5 additions & 6 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
Expand All @@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );
int deviceCount = supportedCUDADevices().size();

if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
Expand All @@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h

#include <vector>

// Returns the ids of the visible CUDA devices on which a trivial test kernel
// runs successfully, i.e. the devices whose architecture is supported by the
// CUDA configuration and compilation flags. Returns an empty vector if the
// CUDA runtime cannot be initialised or no device is supported.
std::vector<int> supportedCUDADevices();

#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
6 changes: 6 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
Expand All @@ -16,4 +17,9 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
Loading

0 comments on commit 2be0bc1

Please sign in to comment.