Skip to content

Commit

Permalink
Merge pull request #34950 from fwyzard/CUDA_fix_concurrent_EventSetup_filling
Browse files Browse the repository at this point in the history

Fix uploading the EventSetup conditions to multiple CUDA devices [11.3.x]
  • Loading branch information
cmsbuild authored Aug 23, 2021
2 parents 07247b0 + c9c01df commit 944e28b
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 12 deletions.
18 changes: 11 additions & 7 deletions HeterogeneousCore/CUDACore/interface/ESProduct.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "FWCore/Utilities/interface/thread_safety_macros.h"
#include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
Expand All @@ -19,23 +20,26 @@ namespace cms {
class ESProduct {
public:
ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
gpuDataPerDevice_[i].m_event = getEventCache().get();
if (not gpuDataPerDevice_.empty()) {
cms::cuda::ScopedSetDevice scopedDevice;
for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
scopedDevice.set(i);
gpuDataPerDevice_[i].m_event = getEventCache().get();
}
}
}

~ESProduct() = default;

// transferAsync should be a function of (T&, cudaStream_t)
// which enqueues asynchronous transfers (possibly kernels as well)
// to the CUDA stream
template <typename F>
const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
auto device = currentDevice();

int device = currentDevice();
auto& data = gpuDataPerDevice_[device];

// If GPU data has already been filled, we can return it
// immediately
// If the GPU data has already been filled, we can return it immediately
if (not data.m_filled.load()) {
// It wasn't, so need to fill it
std::scoped_lock<std::mutex> lk{data.m_mutex};
Expand Down Expand Up @@ -103,4 +107,4 @@ namespace cms {
} // namespace cuda
} // namespace cms

#endif
#endif // HeterogeneousCore_CUDACore_ESProduct_h
25 changes: 20 additions & 5 deletions HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,35 @@ namespace cms {
namespace cuda {
class ScopedSetDevice {
public:
explicit ScopedSetDevice(int newDevice) {
cudaCheck(cudaGetDevice(&prevDevice_));
cudaCheck(cudaSetDevice(newDevice));
// Store the original device, without setting a new one
ScopedSetDevice() {
// Store the original device
cudaCheck(cudaGetDevice(&originalDevice_));
}

// Store the original device, and set a new current device
explicit ScopedSetDevice(int device) : ScopedSetDevice() {
// Change the current device
set(device);
}

// Restore the original device
~ScopedSetDevice() {
// Intentionally don't check the return value, to avoid throwing
// an exception from the destructor. If this call fails, the
// process is doomed anyway.
cudaSetDevice(prevDevice_);
cudaSetDevice(originalDevice_);
}

// Set a new current device, without changing the original device
// that will be restored when this object is destroyed
void set(int device) {
// Change the current device
cudaCheck(cudaSetDevice(device));
}

private:
int prevDevice_;
int originalDevice_;
};
} // namespace cuda
} // namespace cms
Expand Down

0 comments on commit 944e28b

Please sign in to comment.