diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml new file mode 100644 index 0000000000000..98033aab4d99d --- /dev/null +++ b/CUDADataFormats/Common/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h new file mode 100644 index 0000000000000..75c9c80e7f206 --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -0,0 +1,55 @@ +#ifndef CUDADataFormats_Common_CUDAProduct_h +#define CUDADataFormats_Common_CUDAProduct_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" + +namespace edm { + template + class Wrapper; +} +namespace impl { + class CUDAScopedContextGetterBase; +} + +/** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with CUDAScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. + */ +template +class CUDAProduct : public CUDAProductBase { +public: + CUDAProduct() = default; // Needed only for ROOT dictionary generation + + CUDAProduct(const CUDAProduct&) = delete; + CUDAProduct& operator=(const CUDAProduct&) = delete; + CUDAProduct(CUDAProduct&&) = default; + CUDAProduct& operator=(CUDAProduct&&) = default; + +private: + friend class impl::CUDAScopedContextGetterBase; + friend class CUDAScopedContextProduce; + friend class edm::Wrapper>; + + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data) + : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} + + template + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args) + : CUDAProductBase(device, std::move(stream)), data_(std::forward(args)...) {} + + T data_; //! +}; + +#endif diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h new file mode 100644 index 0000000000000..219b7e619de7f --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -0,0 +1,90 @@ +#ifndef CUDADataFormats_Common_CUDAProductBase_h +#define CUDADataFormats_Common_CUDAProductBase_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" + +namespace impl { + class CUDAScopedContextBase; +} + +/** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. 
+ */ +class CUDAProductBase { +public: + CUDAProductBase() = default; // Needed only for ROOT dictionary generation + ~CUDAProductBase(); + + CUDAProductBase(const CUDAProductBase&) = delete; + CUDAProductBase& operator=(const CUDAProductBase&) = delete; + CUDAProductBase(CUDAProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + CUDAProductBase& operator=(CUDAProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaEvent_t event() const { return event_ ? event_.get() : nullptr; } + +protected: + explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream) + : stream_{std::move(stream)}, device_{device} {} + +private: + friend class impl::CUDAScopedContextBase; + friend class CUDAScopedContextProduce; + + // The following functions are intended to be used only from CUDAScopedContext + void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + cudautils::SharedStreamPtr stream_; //! + // shared_ptr because of caching in CUDAEventCache + cudautils::SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! +}; + +#endif diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc new file mode 100644 index 0000000000000..72302d3165676 --- /dev/null +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -0,0 +1,27 @@ +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +bool CUDAProductBase::isAvailable() const { + // In absence of event, the product was available already at the end + // of produce() of the producer. + if (not event_) { + return true; + } + return cudautils::eventIsOccurred(event_.get()); +} + +CUDAProductBase::~CUDAProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. 
This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + if (event_) { + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventSynchronize(event_.get()); + } +} diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml new file mode 100644 index 0000000000000..5e804fe80a736 --- /dev/null +++ b/CUDADataFormats/Common/test/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc new file mode 100644 index 0000000000000..3eb3115571813 --- /dev/null +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -0,0 +1,68 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" + +#include + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { + SECTION("Default constructed") { + auto foo = CUDAProduct(); + REQUIRE(!foo.isValid()); + + auto bar = std::move(foo); + } + + if (not hasCUDADevices()) { + return; + } + + constexpr int defaultDevice = 0; + cudaCheck(cudaSetDevice(defaultDevice)); + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + std::unique_ptr> dataPtr = ctx.wrap(10); + auto& data = *dataPtr; + + SECTION("Construct from CUDAScopedContext") { + REQUIRE(data.isValid()); + REQUIRE(data.device() == defaultDevice); + REQUIRE(data.stream() == ctx.stream()); + REQUIRE(data.event() != nullptr); + } + + SECTION("Move constructor") { + auto data2 = CUDAProduct(std::move(data)); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + + SECTION("Move assignment") { + CUDAProduct data2; + data2 = std::move(data); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h index efc7d9e6cde0b..44f7b1ca14944 100644 --- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h +++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h @@ -24,9 +24,9 @@ 
#include "tbb/task_arena.h" -namespace edm { +#include "FWCore/Concurrency/interface/WaitingTask.h" - class WaitingTask; +namespace edm { class WaitingTaskHolder; class WaitingTaskWithArenaHolder { @@ -72,5 +72,29 @@ namespace edm { WaitingTask* m_task; std::shared_ptr m_arena; }; + + template + auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) { + return [holder = std::move(h), func = std::forward(f)]() mutable { + try { + func(holder); + } catch (...) { + holder.doneWaiting(std::current_exception()); + } + }; + } + + template + auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) { + return make_waiting_task( + std::forward(iAlloc), + [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable { + if (excptr) { + holder.doneWaiting(*excptr); + return; + } + func(); + }); + } } // namespace edm #endif diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml new file mode 100644 index 0000000000000..d78c8a28f0470 --- /dev/null +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..3948ae7e59f79 --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,1003 @@ +# CUDA algorithms in CMSSW + +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA output](#producer-with-cuda-output) + * [Producer with CUDA input](#producer-with-cuda-input) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Analyzer with CUDA input](#analyzer-with-cuda-input) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) + * [CUDA ESProduct](#cuda-esproduct) + +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. 
Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multiple devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). + * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Launch kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with + `CUDAScopedContextAcquire`/`CUDAScopedContextProduce`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. + * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams + +## Sub-packages +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data + +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... 
+ IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerOutputGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT> outputToken_; + CUDAContextState ctxState_; +}; +... +void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +### Producer with CUDA input + +```cpp +class ProducerInputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; + edm::EDPutTokenT outputToken_; +}; +... 
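+// (Illustrative sketch only, not part of the original example: the tokens
+// declared above would typically be initialized in the module constructor.
+// The configuration parameter names "src" and "otherSrc" are hypothetical.)
+// ProducerInputCUDA::ProducerInputCUDA(edm::ParameterSet const& iConfig)
+//     : inputToken_{consumes<CUDAProduct<InputData>>(iConfig.getParameter<edm::InputTag>("src"))},
+//       otherInputToken_{consumes<CUDAProduct<OtherInputData>>(iConfig.getParameter<edm::InputTag>("otherSrc"))},
+//       outputToken_{produces<OutputData>()} {}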
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +See [further below](#setting-the-current-device) for the conditions +when the `CUDAScopedContextAcquire` constructor reuses the CUDA stream. Note +that the `CUDAScopedContextAcquire` constructor taking `edm::StreamID` is +allowed, it will just always create a new CUDA stream. + + +### Producer with CUDA input and output (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. 
If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc) + + +### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + void addMoreWork(edm::WaitingTaskWithArenaHolder waitingTashHolder); + + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Push a functor on top of "a stack of tasks" to be run as a next + // task after the work queued above before produce(). In this case ctx + // is a context constructed by the calling TBB task, and therefore the + // current device and CUDA stream have been already set up. The ctx + // internally holds the WaitingTaskWithArenaHolder for the next task. 
+ + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + addMoreWork(ctx); + }); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work queued in acquire() has finished +void ProducerInputOutputCUDA::addMoreWork(CUDAScopedContextTask& ctx) { + // Current device and CUDA stream have already been set + + // Queues more asynchronous data transfer and kernels to the CUDA + // stream returned by CUDAScopedContextTask::stream() + gpuAlgo.makeMoreAsync(ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work queued in addMoreWork() has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc) + + +### Producer with CUDA input and output (without ExternalWork) + +If the producer does not need to transfer anything back to CPU (like +the number of output elements), the `ExternalWork` extension is not +needed as there is no need to synchronize. + +```cpp +class ProducerInputOutputCUDA: public edm::global::EDProducer<> { +public: + ... + void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextProduce ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextProduce holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextProduce::stream(). Here makeAsync() also + // returns data in GPU memory that is passed to the constructor of + // OutputData. CUDAScopedContextProduce::emplace() wraps the OutputData to + // CUDAProduct. CUDAProduct stores also the current + // device and the CUDA stream since those will be needed in the + // consumer side. 
+ ctx.emplace(iEvent, outputToken, gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc) + + +### Analyzer with CUDA input + +Analyzer with CUDA input is similar to [producer with CUDA +input](#producer-with-cuda-input). Note that currently we do not have +a mechanism for portable configurations with analyzers (like +[`SwitchProducer`](#automatic-switching-between-cpu-and-gpu-modules) +for producers). This means that a configuration with a CUDA analyzer +can only run on a machine with CUDA device(s). + +```cpp +class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> { +public: + ... + void analyzer(edm::Event const& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + AnalyzerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; +}; +... +void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAnalyze ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAnalyze holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAnalyze::stream() + gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc) + + +### Configuration + +#### GPU-only configuration + +For a GPU-only configuration there is nothing special to be done, just +construct the Paths/Sequences/Tasks from the GPU modules. + +#### Automatic switching between CPU and GPU modules + +The `SwitchProducer` mechanism can be used to switch automatically +between CPU and GPU modules based on the availability of GPUs on the +machine where the configuration is done. Framework decides at the +beginning of the job which of the modules to run for a given module +label. + +Framework requires that the modules in the switch must produce the +same types of output products (the closer the actual results are the +better, but the framework can not enforce that). This means that for a +chain of GPU modules, it is the module that transforms the SoA data +format back to the legacy data formats (possibly, but not necessarily, +transferring the SoA data from GPU to CPU) that should be switched +between the legacy CPU module. The rest of the GPU modules should be +placed to a `Task`, in which case framework runs them only if their +output is needed by another module. 
+ +```python +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +process.foo = SwitchProducerCUDA( + cpu = cms.EDProducer("FooProducer"), # legacy CPU + cuda = cms.EDProducer("FooProducerFromCUDA", + src="fooCUDA" + ) +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") + +process.fooTaskCUDA = cms.Task(process.fooCUDA) +process.fooTask = cms.Task( + process.foo, + process.fooTaskCUDA +) +``` + +For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py). + + + + + +## More details + +### Device choice + +As discussed above, with `SwitchProducer` the choice between CPU and +GPU modules is done at the beginning of the job. + +For multi-GPU setup the device is chosen in the first CUDA module in a +chain of modules by one of the constructors of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce` +```cpp +// In ExternalWork acquire() +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; + +// In normal produce() (or filter()) +CUDAScopedContextProduce ctx{iEvent.streamID()}; +``` +As the choice is still the static EDM stream to device assignment, the +EDM stream ID is needed. The logic will likely evolve in the future to +be more dynamic, and likely the device choice has to be made for the +full event. + +### Data model + +The "GPU data product" should be a class/struct containing smart +pointer(s) to device data (see [Memory allocation](#memory-allocation)). +When putting the data to event, the data is wrapped to +`CUDAProduct` template, which holds +* the GPU data product + * must be moveable, but no other restrictions +* the current device where the data was produced, and the CUDA stream the data was produced with +* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) + +Note that the `CUDAProduct` wrapper can be constructed only with +`CUDAScopedContextProduce::wrap()`, and the data `T` can be obtained +from it only with +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +as described further below. When putting the data product directly to +`edm::Event`, also `CUDASCopedContextProduce::emplace()` can be used. + +The GPU data products that depend on the CUDA runtime should be placed +under `CUDADataFormats` package, using the same name for sub-package +that would be used in `DataFormats`. Everything else, e.g. SoA for +CPU, should go under `DataFormats` as usual. + + +### CUDA EDProducer + +#### Class declaration + +The CUDA producers are normal EDProducers. The `ExternalWork` +extension should be used if a synchronization between the GPU and CPU +is needed, e.g. when transferring data from GPU to CPU. + +#### Memory allocation + +##### Caching allocator + +The memory allocations should be done dynamically with the following functions +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +cudautils::device::unique_ptr device_buffer = cudautils::make_device_unique(50, cudaStream); +cudautils::host::unique_ptr host_buffer = cudautils::make_host_unique(50, cudaStream); +``` + +in the `acquire()` and `produce()` functions. The same +`cudaStream_t` object that is used for transfers and kernels +should be passed to the allocator. + +The allocator is based on `cub::CachingDeviceAllocator`. 
The memory is +guaranteed to be reserved +* for the host: up to the destructor of the `unique_ptr` +* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished + +##### Non-cached pinned host `unique_ptr` + +In producers transferring data to GPU one may want to pinned host +memory allocated with `cudaHostAllocWriteCombined`. As of now we don't +want to include the flag dimension to the caching allocator. The CUDA +API wrapper library does not support allocation flags, so we add our +own `unique_ptr` for that. + +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" + +cudautils::host::noncached_unique_ptr host_buffer = cudautils::make_host_noncached_unique(50, flags); +``` +The `flags` is passed directly to `cudaHostAlloc()`. + +##### CUDA API + +The `cudaMalloc()` etc may be used outside of the event loop, but that +should be limited to only relatively small allocations in order to +allow as much re-use of device memory as possible. + +If really needed, the `cudaMalloc()` etc may be used also within the +event loop, but then the cost of allocation and implicit +synchronization should be explicitly amortized e.g. by caching. + +#### Setting the current device + +A CUDA producer should construct `CUDAScopedContextAcquire` in +`acquire()` (`CUDAScopedContextProduce` `produce()` if not using +`ExternalWork`) either with `edm::StreamID`, or with a +`CUDAProduct` read as an input. + +```cpp +// From edm::StreamID +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; +// or +CUDAScopedContextProduce ctx{iEvent.streamID()}; + + +// From CUDAProduct +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAcquire ctx{cclus, ...}; +// or +CUDAScopedContextProduce ctx{cclus}; +``` + +A CUDA analyzer should construct `CUDAScopedContextAnalyze` with a +`CUDAProduct` read as an input. + +```cpp +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAnalyze ctx{cclus}; +``` + +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze` work in the RAII way and does the following +* Sets the current device for the current scope + - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream + - If constructed from the `CUDAProduct`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct` + * The CUDA stream is reused if this producer is the first consumer + of the `CUDAProduct`, otherwise a new CUDA stream is created. + This approach is simple compromise to automatically express the work of + parallel producers in different CUDA streams, and at the same + time allow a chain of producers to queue their work to the same + CUDA stream. +* Gives access to the CUDA stream the algorithm should use to queue asynchronous work +* `CUDAScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) +* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) +* Needed to get `CUDAProduct` from the event + * `CUDAScopedContextProduce` is needed to put `CUDAProduct` to the event + +In case of multiple input products, from possibly different CUDA +streams and/or CUDA devices, this approach gives the developer full +control in which of them the kernels of the algorithm should be run. 
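+
+As a sketch (with hypothetical product types and token names, not taken
+from an existing module), a producer with two CUDA inputs could construct
+the context from the product whose device and CUDA stream the kernels
+should run on, and access the other input through the context:
+
+```cpp
+CUDAProduct<InputData1> const& in1 = iEvent.get(input1Token_);
+CUDAProduct<InputData2> const& in2 = iEvent.get(input2Token_);
+
+// Kernels of this module are queued to the device (and possibly the
+// CUDA stream) of in1
+CUDAScopedContextAcquire ctx{in1, std::move(waitingTaskHolder)};
+
+auto const& data1 = ctx.get(in1);
+// Throws if in2 resides on a different device; if in2 was produced in a
+// different CUDA stream, synchronizes with a CUDA event and cudaStreamWaitEvent()
+auto const& data2 = ctx.get(in2);
+
+gpuAlgo_.makeAsync(data1, data2, ctx.stream());
+```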
+ +#### Getting input + +The real product (`T`) can be obtained from `CUDAProduct` only with +the help of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +// From CUDAProduct +CUDAProduct cclus = iEvent.get(srcToken_); +GPUClusters const& clus = ctx.get(cclus); + +// Directly from Event +GPUClusters const& clus = ctx.get(iEvent, srcToken_); +``` + +This step is needed to +* check that the data are on the same CUDA device + * if not, throw an exception (with unified memory could prefetch instead) +* if the CUDA streams are different, synchronize between them + +#### Calling the CUDA kernels + +It is usually best to wrap the CUDA kernel calls to a separate class, +and then call methods of that class from the EDProducer. The only +requirement is that the CUDA stream where to queue the operations +should be the one from the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +gpuAlgo.makeClustersAsync(..., ctx.stream()); +``` + +If necessary, different CUDA streams may be used internally, but they +should to be made to synchronize with the provided CUDA stream with +CUDA events and `cudaStreamWaitEvent()`. + + +#### Putting output + +The GPU data needs to be wrapped to `CUDAProduct` template with +`CUDAScopedContextProduce::wrap()` or `CUDAScopedContextProduce::emplace()` + +```cpp +GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); +std::unique_ptr> ret = ctx.wrap(clusters); +iEvent.put(std::move(ret)); + +// or with one line +iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); + +// or avoid one unique_ptr with emplace +edm::PutTokenT> putToken_ = produces>(); // in constructor +... +ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); +``` + +This step is needed to +* store the current device and CUDA stream into `CUDAProduct` +* record the CUDA event needed for CUDA stream synchronization + +#### `ExternalWork` extension + +Everything above works both with and without `ExternalWork`. + +Without `ExternalWork` the `EDProducer`s act similar to TBB +flowgraph's "streaming node". In other words, they just queue more +asynchronous work to the CUDA stream in their `produce()`. + +The `ExternalWork` is needed when one would otherwise call +`cudeStreamSynchronize()`. For example transferring something to CPU +needed for downstream DQM, or queueing more asynchronous work. With +`ExternalWork` an `acquire()` method needs to be implemented that gets +an `edm::WaitingTaskWithArenaHolder` parameter. The +`edm::WaitingTaskWithArenaHolder` should then be passed to the +constructor of `CUDAScopedContextAcquire` along + +```cpp +void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& cclus = iEvent.get(token_); + CUDAScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well + ... +``` + +When constructed this way, `CUDAScopedContextAcquire` registers a +callback function to the CUDA stream in its destructor to call +`waitingTaskHolder.doneWaiting()`. + +A GPU->GPU producer needs a `CUDAScopedContext` also in its +`produce()`. The device and CUDA stream are transferred via +`CUDAContextState` member variable: + +```cpp +class FooProducerCUDA ... { + ... + CUDAContextState ctxState_; +}; + +void acquire(...) { + ... + FooProducerCUDA::CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; + ... +} + +void produce(...( { + ... 
+ FooProducerCUDA::CUDAScopedContextProduce ctx{ctxState_}; +} +``` + +The `CUDAScopedContextAcquire` saves its state to the `ctxState_` in +the destructor, and `CUDAScopedContextProduce` then restores the +context. + +#### Module-internal chain of CPU and GPU tasks + +Technically `ExternalWork` works such that the framework calls +`acquire()` with a `edm::WaitingTaskWithArenaHolder` that holds an +`edm::WaitingTask` (that inherits from `tbb::task`) for calling +`produce()` in a `std::shared_ptr` semantics: spawn the task when +reference count hits `0`. It is also possible to create a longer chain +of such tasks, alternating between CPU and GPU work. This mechanism +can also be used to re-run (part of) the GPU work. + +The "next tasks" to run are essentially structured as a stack, such +that +- `CUDAScopedContextAcquire`/`CUDAScopedContextTask::pushNextTask()` + pushes a new functor on top of the stack +- Completion of both the asynchronous work and the queueing function + pops the top task of the stack and enqueues it (so that TBB + eventually runs the task) + * Technically the task is made eligible to run when all copies of + `edm::WaitingTaskWithArenaHolder` of the acquire() (or "previous" + function) have either been destructed or their `doneWaiting()` has + been called + * The code calling `acquire()` or the functor holds one copy of + `edm::WaitingTaskWithArenaHolder` so it is guaranteed that the + next function will not run before the earlier one has finished + + +Below is an example how to push a functor on top of the stack of tasks +to run next (following the example of the previous section) +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... + }); + ... +} +``` + +In this case the `ctx`argument to the function is a +`CUDAScopedContexTask` object constructed by the TBB task calling the +user-given function. It follows that the current device and CUDA +stream have been set up already. The `pushNextTask()` can be called +many times. On each invocation the `pushNextTask()` pushes a new task +on top of the stack (i.e. in front of the chain). It follows that in +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 1 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 2 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 3 + }); + ... +} +``` +the functions will be run in the order 3, 2, 1. + +**Note** that the `CUDAService` is **not** available (nor is any other +service) in these intermediate tasks. In the near future memory +allocations etc. will be made possible by taking them out from the +`CUDAService`. + +The `CUDAScopedContextAcquire`/`CUDAScopedContextTask` have also a +more generic member function, `replaceWaitingTaskHolder()`, that can +be used to just replace the currently-hold +`edm::WaitingTaskWithArenaHolder` (that will get notified by the +callback function) with anything. In this case the caller is +responsible of creating the task(s) and setting up the chain of them. + + +#### Transferring GPU data to CPU + +The GPU->CPU data transfer needs synchronization to ensure the CPU +memory to have all data before putting that to the event. This means +the `ExternalWork` needs to be used along +* In `acquire()` + * (allocate CPU memory buffers) + * Queue all GPU->CPU transfers asynchronously +* In `produce()` + * If needed, read additional CPU products (e.g. 
from `edm::Ref`s) + * Reformat data back to legacy data formats + * Note: `CUDAScopedContextProduce` is **not** needed in `produce()` + +#### Synchronizing between CUDA streams + +In case the producer needs input data that were produced in two (or +more) CUDA streams, these streams have to be synchronized. Here this +synchronization is achieved with CUDA events. + +Each `CUDAProduct` constains also a CUDA event object. The call to +`CUDAScopedContextProduce::wrap()` will *record* the event in the CUDA +stream. This means that when all work queued to the CUDA stream up to +that point has been finished, the CUDA event becomes *occurred*. Then, +in +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +if the `CUDAProduct` to get from has a different CUDA stream than +the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`, +`cudaStreamWaitEvent(stream, event)` is called. This means that all +subsequent work queued to the CUDA stream will wait for the CUDA event +to become occurred. Therefore this subsequent work can assume that the +to-be-getted CUDA product exists. + + +### CUDA ESProduct + +Conditions data can be transferred to the device with the following +pattern. + +1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload") +2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory +3. The wrapper should have a function returning the payload on the + device memory. The function should transfer the data to the device + asynchronously with the help of `CUDAESProduct`. + +#### Example + +```cpp +#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" + +// Declare the struct for the payload to be transferred. Here the +// example is an array with (potentially) dynamic size. Note that all of +// below becomes simpler if the array has compile-time size. +struct ESProductExampleCUDA { + float *someData; + unsigned int size; +}; + +// Declare the wrapper ESProduct. The corresponding ESProducer should +// produce objects of this type. 
+class ESProductExampleCUDAWrapper { +public: + // Constructor takes the standard CPU ESProduct, and transforms the + // necessary data to array(s) in pinned host memory + ESProductExampleCUDAWrapper(ESProductExample const&); + + // Deallocates all pinned host memory + ~ESProductExampleCUDAWrapper(); + + // Function to return the actual payload on the memory of the current device + ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const; + +private: + // Holds the data in pinned CPU memory + float *someData_; + unsigned int size_; + + // Helper struct to hold all information that has to be allocated and + // deallocated per device + struct GPUData { + // Destructor should free all member pointers + ~GPUData(); + // internal pointers are on device, struct itself is on CPU + ESProductExampleCUDA *esproductHost = nullptr; + // internal pounters and struct are on device + ESProductExampleCUDA *esproductDevice = nullptr; + }; + + // Helper that takes care of complexity of transferring the data to + // multiple devices + CUDAESProduct gpuData_; +}; + +ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) { + cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS)); + // fill someData_ and size_ from cpuProduct +} + +ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const { + // CUDAESProduct essentially holds an array of GPUData objects, + // one per device. If the data have already been transferred to the + // current device (or the transfer has been queued), the helper just + // returns a reference to that GPUData object. Otherwise, i.e. data are + // not yet on the current device, the helper calls the lambda to do the + // necessary memory allocations and to queue the transfers. + auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) { + // Allocate memory. Currently this can be with the CUDA API, + // sometime we'll migrate to the caching allocator. Assumption is + // that IOV changes are rare enough that adding global synchronization + // points is not that bad (for now). + + // Allocate the payload object on pinned host memory. + cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA))); + // Allocate the payload array(s) on device memory. + cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS)); + + // Allocate the payload object on the device memory. + cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductDevice))); + + // Complete the host-side information on the payload + data.cablingMapHost->size = this->size_; + + + // Transfer the payload, first the array(s) ... + cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream)); + // ... 
and then the payload object + cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproduceHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream)); +}); + + // Returns the payload object on the memory of the current device + return data.esproductDevice; +} + +// Destructor frees all member pointers +ESProductExampleCUDA::GPUData::~GPUData() { + if(esproductHost != nullptr) { + cudaCheck(cudaFree(esproductHost->someData)); + cudaCheck(cudaFreeHost(esproductHost)); + } + cudaCheck(cudaFree(esProductDevice)); +} + +``` diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h new file mode 100644 index 0000000000000..b3c20dcb73159 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h @@ -0,0 +1,57 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h +#define HeterogeneousCore_CUDACore_CUDAContextState_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +#include + +/** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. + */ +class CUDAContextState { +public: + CUDAContextState() = default; + ~CUDAContextState() = default; + + CUDAContextState(const CUDAContextState&) = delete; + CUDAContextState& operator=(const CUDAContextState&) = delete; + CUDAContextState(CUDAContextState&&) = delete; + CUDAContextState& operator=(CUDAContextState&& other) = delete; + +private: + friend class CUDAScopedContextAcquire; + friend class CUDAScopedContextProduce; + friend class CUDAScopedContextTask; + + void set(int device, cudautils::SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const cudautils::SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + cudautils::SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. 
+ return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + cudautils::SharedStreamPtr stream_; + int device_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h new file mode 100644 index 0000000000000..b8b230e510fa3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -0,0 +1,100 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h +#define HeterogeneousCore_CUDACore_CUDAESProduct_h + +#include +#include +#include +#include + +#include "FWCore/Concurrency/interface/hardware_pause.h" +#include "FWCore/Utilities/interface/thread_safety_macros.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +template +class CUDAESProduct { +public: + CUDAESProduct() : gpuDataPerDevice_(cudautils::cudaDeviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + } + ~CUDAESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { + auto device = cudautils::currentDevice(); + + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (cudautils::eventIsOccurred(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != cudaStream) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), + "Failed to make a stream to wait for an event"); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. 
+ transferAsync(data.m_data, cudaStream); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = cudaStream; + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + +private: + struct Item { + mutable std::mutex m_mutex; + CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; + // non-null if some thread is already filling (cudaStream_t is just a pointer) + CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + CMS_THREAD_GUARD(m_mutex) mutable T m_data; + }; + + std::vector gpuDataPerDevice_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h new file mode 100644 index 0000000000000..758218bb958a2 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -0,0 +1,252 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h +#define HeterogeneousCore_CUDACore_CUDAScopedContext_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/EDPutToken.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +namespace cudatest { + class TestCUDAScopedContext; +} + +namespace impl { + // This class is intended to be derived by other CUDAScopedContext*, not for general use + class CUDAScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). 
+ explicit CUDAScopedContextBase(edm::StreamID streamID); + + explicit CUDAScopedContextBase(const CUDAProductBase& data); + + explicit CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream); + + private: + int currentDevice_; + cudautils::SharedStreamPtr stream_; + }; + + class CUDAScopedContextGetterBase : public CUDAScopedContextBase { + public: + template + const T& get(const CUDAProduct& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + CUDAScopedContextGetterBase(Args&&... args) : CUDAScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class CUDAScopedContextHolderHelper { + public: + CUDAScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, CUDAContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; +} // namespace impl + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
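+ *
+ * A minimal usage sketch for an ExternalWork producer (the member and token
+ * names below are illustrative, not part of this interface):
+ *
+ *   void MyProducer::acquire(edm::Event const& iEvent, edm::EventSetup const&,
+ *                            edm::WaitingTaskWithArenaHolder holder) {
+ *     CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(holder), ctxState_};
+ *     auto const& input = ctx.get(iEvent, srcToken_);
+ *     // ... queue asynchronous work to ctx.stream() ...
+ *   }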
+ */ +class CUDAScopedContextAcquire : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to create a new CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~CUDAScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + +private: + void throwNoState(); + + impl::CUDAScopedContextHolderHelper holderHelper_; + CUDAContextState* contextState_ = nullptr; +}; + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ +class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(edm::StreamID streamID) : CUDAScopedContextGetterBase(streamID) {} + + /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit CUDAScopedContextProduce(CUDAContextState& state) + : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + ~CUDAScopedContextProduce(); + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + // + // CUDAProduct constructor records CUDA event to the CUDA + // stream. The event will become "occurred" after all work queued + // to the stream before this point has been finished. + std::unique_ptr> ret(new CUDAProduct(device(), streamPtr(), std::move(data))); + createEventIfStreamBusy(); + ret->setEvent(event_); + return ret; + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) {
+    auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward(args)...);
+    createEventIfStreamBusy();
+    const_cast(*ret).setEvent(event_);
+    return ret;
+  }
+
+private:
+  friend class cudatest::TestCUDAScopedContext;
+
+  // This constructor is only meant for testing
+  explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
+      : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+  void createEventIfStreamBusy();
+
+  cudautils::SharedEventPtr event_;
+};
+
+/**
+ * The aim of this class is to do necessary per-task "initialization" in tasks created in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+class CUDAScopedContextTask : public impl::CUDAScopedContextBase {
+public:
+  /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+  explicit CUDAScopedContextTask(CUDAContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+      : CUDAScopedContextBase(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
+        holderHelper_{std::move(waitingTaskHolder)},
+        contextState_{state} {}
+
+  ~CUDAScopedContextTask();
+
+  template
+  void pushNextTask(F&& f) {
+    holderHelper_.pushNextTask(std::forward(f), contextState_);
+  }
+
+  void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+    holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+  }
+
+private:
+  impl::CUDAScopedContextHolderHelper holderHelper_;
+  CUDAContextState const* contextState_;
+};
+
+/**
+ * The aim of this class is to do necessary per-event "initialization" in analyze()
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
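+ *
+ * A minimal usage sketch (the token and type names below are illustrative):
+ *
+ *   CUDAProduct<InputData> const& input = iEvent.get(srcToken_);
+ *   CUDAScopedContextAnalyze ctx{input};
+ *   InputData const& data = ctx.get(input);
+ *   // ... queue asynchronous work to ctx.stream() ...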
+ */ +class CUDAScopedContextAnalyze : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to (possibly) re-use a CUDA stream + explicit CUDAScopedContextAnalyze(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} +}; + +namespace impl { + template + void CUDAScopedContextHolderHelper::pushNextTask(F&& f, CUDAContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(CUDAScopedContextTask{state, std::move(h)}); + })}); + } +} // namespace impl + +#endif diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py new file mode 100644 index 0000000000000..ded114e2fddfe --- /dev/null +++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py @@ -0,0 +1,34 @@ +import FWCore.ParameterSet.Config as cms + +_cuda_enabled_cached = None + +def _switch_cuda(): + global _cuda_enabled_cached + if _cuda_enabled_cached is None: + import os + _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0) + return (_cuda_enabled_cached, 2) + +class SwitchProducerCUDA(cms.SwitchProducer): + def __init__(self, **kargs): + super(SwitchProducerCUDA,self).__init__( + dict(cpu = cms.SwitchProducer.getCpu(), + cuda = _switch_cuda), + **kargs + ) +cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA") + +if __name__ == "__main__": + import unittest + + class TestSwitchProducerCUDA(unittest.TestCase): + def testPickle(self): + import pickle + sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar")) + pkl = pickle.dumps(sp) + unpkl = pickle.loads(pkl) + self.assertEqual(unpkl.cpu.type_(), "Foo") + self.assertEqual(unpkl.cuda.type_(), "Bar") + + unittest.main() + diff --git a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc new file mode 100644 index 0000000000000..bcdbae89d9094 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc @@ -0,0 +1,14 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "FWCore/Utilities/interface/Exception.h" + +void CUDAContextState::throwIfStream() const { + if (stream_) { + throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state"; + } +} + +void CUDAContextState::throwIfNoStream() const { + if (not stream_) { + throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state"; + } +} diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc new file mode 100644 index 0000000000000..df56c318e22fa --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -0,0 +1,134 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "chooseCUDADevice.h" + +namespace { + struct CallbackData { + 
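+    // Passed as the user data of cudaStreamAddCallback() in enqueueCallback()
+    // below, and released (via std::unique_ptr) inside cudaScopedContextCallback().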
edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " + << streamId; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device + << " error " << error << ": " << message; + } catch (cms::Exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace impl { + CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID) + : currentDevice_(cudacore::chooseCUDADevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); + stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + } + + CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) : currentDevice_(data.device()) { + cudaCheck(cudaSetDevice(currentDevice_)); + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + } + } + + CUDAScopedContextBase::CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream) + : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + //////////////////// + + void CUDAScopedContextGetterBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). 
+ cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } + + void CUDAScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { + cudaCheck( + cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + } +} // namespace impl + +//////////////////// + +CUDAScopedContextAcquire::~CUDAScopedContextAcquire() { + holderHelper_.enqueueCallback(device(), stream()); + if (contextState_) { + contextState_->set(device(), std::move(streamPtr())); + } +} + +void CUDAScopedContextAcquire::throwNoState() { + throw cms::Exception("LogicError") + << "Calling CUDAScopedContextAcquire::insertNextTask() requires CUDAScopedContextAcquire to be constructed with " + "CUDAContextState, but that was not the case"; +} + +//////////////////// + +CUDAScopedContextProduce::~CUDAScopedContextProduce() { + if (event_) { + cudaCheck(cudaEventRecord(event_.get(), stream())); + } +} + +void CUDAScopedContextProduce::createEventIfStreamBusy() { + if (event_) { + return; + } + auto ret = cudaStreamQuery(stream()); + if (ret == cudaSuccess) { + return; + } + if (ret != cudaErrorNotReady) { + // cudaErrorNotReady indicates that the stream is busy, and thus + // is not an error + cudaCheck(ret); + } + + event_ = cudautils::getCUDAEventCache().getCUDAEvent(); +} + +//////////////////// + +CUDAScopedContextTask::~CUDAScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc new file mode 100644 index 0000000000000..7e9ac2faed380 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -0,0 +1,18 @@ +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "chooseCUDADevice.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id) { + edm::Service cudaService; + + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
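+    // For example, with 8 edm::Streams and 3 devices the assignment would be
+    //   stream id: 0 1 2 3 4 5 6 7
+    //   device:    0 1 2 0 1 2 0 1
+    // i.e. devices 0 and 1 each serve three streams while device 2 serves two.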
+ // + // TODO: improve the "assignment" logic + return id % cudaService->numberOfDevices(); + } +} // namespace cudacore diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h new file mode 100644 index 0000000000000..bb09c302af7f5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h +#define HeterogeneousCore_CUDACore_chooseCUDADevice_h + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id); +} + +#endif diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml new file mode 100644 index 0000000000000..a6f34c70e8822 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu new file mode 100644 index 0000000000000..f819a78f698e4 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -0,0 +1,134 @@ +/** + * The purpose of this test program is to ensure that the logic for + * CUDA event use in CUDAProduct and CUDAScopedContext + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + +namespace { + constexpr int ARRAY_SIZE = 20000000; + constexpr int NLOOPS = 10; +} // namespace + +__global__ void kernel_looping(float *point, unsigned int num) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + + for (int iloop = 0; iloop < NLOOPS; ++iloop) { + for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x) { + point[offset] += 1; + } + } +} + +int main() { + requireCUDADevices(); + + constexpr bool debug = false; + + float *dev_points1; + float *host_points1; + cudaStream_t stream1, stream2; + cudaEvent_t event1, event2; + + cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float))); + cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float))); + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + cudaEventCreate(&event1); + cudaEventCreate(&event2); + + for (size_t j = 0; j < ARRAY_SIZE; ++j) { + host_points1[j] = static_cast(j); + } + + cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1)); + kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); + if (debug) + std::cout << "Kernel launched on stream1" << std::endl; + + auto status = cudaStreamQuery(stream1); + if (debug) + std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + cudaEventRecord(event1, stream1); + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + if (debug) { + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + cudaStreamWaitEvent(stream2, event1, 0); + if (debug) + std::cout << "\nStream2 waiting for event1" << std::endl; + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + if (debug) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + status = cudaStreamQuery(stream1); + if (debug) { + std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) + << std::endl; + std::cout << "Synchronizing stream1" << std::endl; + } + assert(status == cudaErrorNotReady); + cudaStreamSynchronize(stream1); + if (debug) + std::cout << "Synchronized stream1" << std::endl; + + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + + cudaFree(dev_points1); + cudaFreeHost(host_points1); + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(event1); + cudaEventDestroy(event2); + + return 0; +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc new file mode 100644 index 0000000000000..219e4dfb20103 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -0,0 +1,134 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTask.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" + +#include "test_CUDAScopedContextKernels.h" + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +namespace { + std::unique_ptr> produce(int device, int* d, int* h) { + auto ctx = cudatest::TestCUDAScopedContext::make(device, true); + cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); + testCUDAScopedContextKernels_single(d, ctx.stream()); + return ctx.wrap(d); + } +} // namespace + +TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { + if (not hasCUDADevices()) { + return; + } + + constexpr int defaultDevice = 0; + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + + SECTION("Construct from device ID") { REQUIRE(cudautils::currentDevice() == defaultDevice); } + + SECTION("Wrap T to CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + REQUIRE(dataPtr.get() != nullptr); + REQUIRE(dataPtr->device() == ctx.device()); + REQUIRE(dataPtr->stream() == ctx.stream()); + } + + SECTION("Construct from from CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + + CUDAScopedContextProduce ctx2{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx2.stream() == data.stream()); + + // Second use of a product should lead to new stream + CUDAScopedContextProduce ctx3{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx3.stream() != data.stream()); + } + + SECTION("Storing state in CUDAContextState") { + CUDAContextState ctxstate; + { // acquire + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + edm::WaitingTaskWithArenaHolder dummy{ + edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})}; + CUDAScopedContextAcquire ctx2{data, 
std::move(dummy), ctxstate}; + } + + { // produce + CUDAScopedContextProduce ctx2{ctxstate}; + REQUIRE(cudautils::currentDevice() == ctx.device()); + REQUIRE(ctx2.stream() == ctx.stream()); + } + } + + SECTION("Joining multiple CUDA streams") { + cudautils::ScopedSetDevice setDeviceForThisScope(defaultDevice); + + // Mimick a producer on the first CUDA stream + int h_a1 = 1; + auto d_a1 = cudautils::make_device_unique(nullptr); + auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); + + // Mimick a producer on the second CUDA stream + int h_a2 = 2; + auto d_a2 = cudautils::make_device_unique(nullptr); + auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2); + + REQUIRE(wprod1->stream() != wprod2->stream()); + + // Mimick a third producer "joining" the two streams + CUDAScopedContextProduce ctx2{*wprod1}; + + auto prod1 = ctx2.get(*wprod1); + auto prod2 = ctx2.get(*wprod2); + + auto d_a3 = cudautils::make_device_unique(nullptr); + testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); + cudaCheck(cudaStreamSynchronize(ctx2.stream())); + REQUIRE(wprod2->isAvailable()); + REQUIRE(cudautils::eventIsOccurred(wprod2->event())); + + h_a1 = 0; + h_a2 = 0; + int h_a3 = 0; + + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + + REQUIRE(h_a1 == 2); + REQUIRE(h_a2 == 4); + REQUIRE(h_a3 == 6); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu new file mode 100644 index 0000000000000..330e83dfd4960 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu @@ -0,0 +1,13 @@ +#include "test_CUDAScopedContextKernels.h" + +namespace { + __global__ void single_mul(int *d) { d[0] = d[0] * 2; } + + __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; } +} // namespace + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); } + +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) { + join_add<<<1, 1, 0, stream>>>(d1, d2, d3); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h new file mode 100644 index 0000000000000..527a4ce71e1cb --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h +#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h + +#include + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream); +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream); + +#endif diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc new file mode 100644 index 0000000000000..2e1027598a4de --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_main.cc @@ -0,0 +1,31 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h" 
+#include "FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" + +class ServiceRegistryListener : public Catch::TestEventListenerBase { +public: + using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor + + void testRunStarting(Catch::TestRunInfo const& testRunInfo) override { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + const std::string config{ + R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.CUDAService = cms.Service('CUDAService') +)_"}; + + std::unique_ptr params; + edm::makeParameterSets(config, params); + edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params))); + operate_.reset(new edm::ServiceRegistry::Operate(tempToken)); + } + +private: + std::unique_ptr operate_; +}; +CATCH_REGISTER_LISTENER(ServiceRegistryListener); diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml new file mode 100644 index 0000000000000..9320cad14f285 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml new file mode 100644 index 0000000000000..041ed25ba134a --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp new file mode 100644 index 0000000000000..5a65575873116 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp @@ -0,0 +1,23 @@ +// C++ standard headers +#include +#include + +// CUDA headers +#include + +// CMSSW headers +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +int main() { + int devices = 0; + cudaCheck(cudaGetDeviceCount(&devices)); + + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + std::cout << std::setw(4) << i << " " << std::setw(2) << properties.major << "." << properties.minor << " " + << properties.name << std::endl; + } + + return 0; +} diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp new file mode 100644 index 0000000000000..d901e1850bceb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. 
+ for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if (properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h new file mode 100644 index 0000000000000..625ce40fdcdc9 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDAServices_CUDAService_h +#define HeterogeneousCore_CUDAServices_CUDAService_h + +#include +#include + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace edm { + class ParameterSet; + class ActivityRegistry; + class ConfigurationDescriptions; +} // namespace edm + +/** + * TODO: + * - CUDA stream management? + * * Not really needed until we want to pass CUDA stream objects from one module to another + * * Which is not really needed until we want to go for "streaming mode" + * * Until that framework's inter-module synchronization is safe (but not necessarily optimal) + * - Management of (preallocated) memory? + */ +class CUDAService { +public: + CUDAService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAService(); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + bool enabled() const { return enabled_; } + + int numberOfDevices() const { return numberOfDevices_; } + + // major, minor + std::pair computeCapability(int device) { return computeCapabilities_.at(device); } + + // Returns the id of device with most free memory. If none is found, returns -1. + int deviceWithMostFreeMemory() const; + +private: + int numberOfDevices_ = 0; + std::vector> computeCapabilities_; + bool enabled_ = false; +}; + +#endif diff --git a/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h new file mode 100644 index 0000000000000..b563b98b516cf --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDAServices_numberOfCUDADevices_h +#define HeterogeneousCore_CUDAServices_numberOfCUDADevices_h + +// Returns the number of CUDA devices +// The difference wrt. the standard CUDA function is that if +// CUDAService is disabled, this function returns 0. 
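+// This lets callers treat "no GPUs present" and "CUDA disabled in the
+// configuration" uniformly, without having to consult CUDAService themselves.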
+int numberOfCUDADevices(); + +#endif diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index afcf86afdef75..81d4f20331ce3 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -1,18 +1,15 @@ -#Skip building plugins by dropping all files for none-AMD64 build - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc new file mode 100644 index 0000000000000..6d8527935e334 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc @@ -0,0 +1,107 @@ +#include + +#include + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace edm { + class StreamContext; +} + +class CUDAMonitoringService { +public: + CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAMonitoringService() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void postModuleConstruction(edm::ModuleDescription const& desc); + void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc); + void postEvent(edm::StreamContext const& sc); + +private: + int numberOfDevices_ = 0; +}; + +CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + if (!cudaService->enabled()) + return; + numberOfDevices_ = cudaService->numberOfDevices(); + + if (config.getUntrackedParameter("memoryConstruction")) { + registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction); + } + if (config.getUntrackedParameter("memoryBeginStream")) { + registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream); + } + if (config.getUntrackedParameter("memoryPerEvent")) { + registry.watchPostEvent(this, &CUDAMonitoringService::postEvent); + } +} + +void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.addUntracked("memoryConstruction", false) + ->setComment("Print memory information for each device after the construction of each module"); + desc.addUntracked("memoryBeginStream", true) + ->setComment("Print memory information for each device after the beginStream() of each module"); + desc.addUntracked("memoryPerEvent", true) + ->setComment("Print memory information for each device after each event"); + + descriptions.add("CUDAMonitoringService", desc); + descriptions.setComment( + "The memory information is the global state of the device. 
This gets confusing if there are multiple processes " + "running on the same device. Probably the information retrieval should be re-thought?"); +} + +// activity handlers +namespace { + template + void dumpUsedMemory(T& log, int num) { + int old = 0; + cudaCheck(cudaGetDevice(&old)); + for (int i = 0; i < num; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + log << "\n" + << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20) + << " MB total"; + } + cudaCheck(cudaSetDevice(old)); + } +} // namespace + +void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after event"; + dumpUsedMemory(log, numberOfDevices_); +} + +DEFINE_FWK_SERVICE(CUDAMonitoringService); diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc index ec8c4deac4d4d..29fa1ab959025 100644 --- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc @@ -41,6 +41,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/ProductKindOfType.h" #include "FWCore/Utilities/interface/TimeOfDay.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" using namespace std::string_literals; @@ -285,9 +286,8 @@ class NVProfilerService { std::vector highlightModules_; const bool showModulePrefetching_; - bool skipFirstEvent_; + const bool skipFirstEvent_; - unsigned int concurrentStreams_; std::atomic globalFirstEventDone_ = false; std::vector> streamFirstEventDone_; std::vector event_; // per-stream event ranges @@ -295,49 +295,22 @@ class NVProfilerService { // use a tbb::concurrent_vector rather than an std::vector because its final size is not known tbb::concurrent_vector global_modules_; // global per-module events -private: - struct Domains { - nvtxDomainHandle_t global; - std::vector stream; - - Domains(NVProfilerService* service) { - global = nvtxDomainCreate("EDM Global"); - allocate_streams(service->concurrentStreams_); - } - - ~Domains() { - nvtxDomainDestroy(global); - for (unsigned int sid = 0; sid < stream.size(); ++sid) { - nvtxDomainDestroy(stream[sid]); - } - } - - void allocate_streams(unsigned int streams) { - stream.resize(streams); - for (unsigned int sid = 0; sid < streams; ++sid) { - stream[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); - } - } - }; - - // allow access to concurrentStreams_ - friend struct Domains; - - tbb::enumerable_thread_specific domains_; - - nvtxDomainHandle_t global_domain() { return domains_.local().global; } - - nvtxDomainHandle_t stream_domain(unsigned int sid) { return 
domains_.local().stream.at(sid); } + nvtxDomainHandle_t global_domain_; // NVTX domain for global EDM transitions + std::vector stream_domain_; // NVTX domains for per-EDM-stream transitions }; NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) : highlightModules_(config.getUntrackedParameter>("highlightModules")), showModulePrefetching_(config.getUntrackedParameter("showModulePrefetching")), - skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")), - concurrentStreams_(0), - domains_(this) { + skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + std::sort(highlightModules_.begin(), highlightModules_.end()); + // create the NVTX domain for global EDM transitions + global_domain_ = nvtxDomainCreate("EDM Global"); + // enables profile collection; if profiling is already enabled it has no effect if (not skipFirstEvent_) { cudaProfilerStart(); @@ -491,7 +464,13 @@ NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::Activ registry.watchPostEventReadFromSource(this, &NVProfilerService::postEventReadFromSource); } -NVProfilerService::~NVProfilerService() { cudaProfilerStop(); } +NVProfilerService::~NVProfilerService() { + for (unsigned int sid = 0; sid < stream_domain_.size(); ++sid) { + nvtxDomainDestroy(stream_domain_[sid]); + } + nvtxDomainDestroy(global_domain_); + cudaProfilerStop(); +} void NVProfilerService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -517,17 +496,20 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, " << bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams() << " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads"; - nvtxDomainMark(global_domain(), out.str().c_str()); + nvtxDomainMark(global_domain_, out.str().c_str()); - concurrentStreams_ = bounds.maxNumberOfStreams(); - for (auto& domain : domains_) { - domain.allocate_streams(concurrentStreams_); + auto concurrentStreams = bounds.maxNumberOfStreams(); + // create the NVTX domains for per-EDM-stream transitions + stream_domain_.resize(concurrentStreams); + for (unsigned int sid = 0; sid < concurrentStreams; ++sid) { + stream_domain_[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); } - event_.resize(concurrentStreams_); - stream_modules_.resize(concurrentStreams_); + + event_.resize(concurrentStreams); + stream_modules_.resize(concurrentStreams); if (skipFirstEvent_) { globalFirstEventDone_ = false; - std::vector> tmp(concurrentStreams_); + std::vector> tmp(concurrentStreams); for (auto& element : tmp) std::atomic_init(&element, false); streamFirstEventDone_ = std::move(tmp); @@ -536,86 +518,86 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes, edm::ProcessContext const& pc) { - nvtxDomainMark(global_domain(), "preBeginJob"); + nvtxDomainMark(global_domain_, "preBeginJob"); // FIXME this probably works only in the absence of subprocesses // size() + 1 because pathsAndConsumes.allModules() does not include the source unsigned int modules = pathsAndConsumes.allModules().size() + 1; 
global_modules_.resize(modules, nvtxInvalidRangeId); - for (unsigned int sid = 0; sid < concurrentStreams_; ++sid) { + for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) { stream_modules_[sid].resize(modules, nvtxInvalidRangeId); } } void NVProfilerService::postBeginJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postBeginJob"); + nvtxDomainMark(global_domain_, "postBeginJob"); } } void NVProfilerService::postEndJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postEndJob"); + nvtxDomainMark(global_domain_, "postEndJob"); } } void NVProfilerService::preSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "source"); + nvtxDomainRangePush(stream_domain_[sid], "source"); } } void NVProfilerService::postSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source lumi"); + nvtxDomainRangePush(global_domain_, "source lumi"); } } void NVProfilerService::postSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source run"); + nvtxDomainRangePush(global_domain_, "source run"); } } void NVProfilerService::postSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("open file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("open file "s + lfn).c_str()); } } void NVProfilerService::postOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("close file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("close file "s + lfn).c_str()); } } void NVProfilerService::postCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } @@ -626,7 +608,7 @@ void NVProfilerService::preModuleBeginStream(edm::StreamContext const& sc, edm:: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " begin stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -634,7 +616,7 @@ void NVProfilerService::postModuleBeginStream(edm::StreamContext const& sc, edm: auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = 
mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -646,7 +628,7 @@ void NVProfilerService::preModuleEndStream(edm::StreamContext const& sc, edm::Mo auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " end stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -654,124 +636,124 @@ void NVProfilerService::postModuleEndStream(edm::StreamContext const& sc, edm::M auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } void NVProfilerService::preGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin run"); + nvtxDomainRangePush(global_domain_, "global begin run"); } } void NVProfilerService::postGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end run"); + nvtxDomainRangePush(global_domain_, "global end run"); } } void NVProfilerService::postGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin run"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin run"); } } void NVProfilerService::postStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream end run"); + nvtxDomainRangePush(stream_domain_[sid], "stream end run"); } } void NVProfilerService::postStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin lumi"); + nvtxDomainRangePush(global_domain_, "global begin lumi"); } } void NVProfilerService::postGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } 
void NVProfilerService::preGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end lumi"); + nvtxDomainRangePush(global_domain_, "global end lumi"); } } void NVProfilerService::postGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin lumi"); } } void NVProfilerService::postStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); - nvtxDomainRangePush(stream_domain(sid), "stream end lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream end lumi"); } void NVProfilerService::postStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - event_[sid] = nvtxDomainRangeStartColor(stream_domain(sid), "event", nvtxDarkGreen); + event_[sid] = nvtxDomainRangeStartColor(stream_domain_[sid], "event", nvtxDarkGreen); } } void NVProfilerService::postEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangeEnd(stream_domain(sid), event_[sid]); + nvtxDomainRangeEnd(stream_domain_[sid], event_[sid]); event_[sid] = nvtxInvalidRangeId; } else { streamFirstEventDone_[sid] = true; @@ -787,7 +769,7 @@ void NVProfilerService::postEvent(edm::StreamContext const& sc) { void NVProfilerService::prePathEvent(edm::StreamContext const& sc, edm::PathContext const& pc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("before path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("before path "s + pc.pathName()).c_str()); } } @@ -796,7 +778,7 @@ void NVProfilerService::postPathEvent(edm::StreamContext const& sc, edm::HLTPathStatus const& hlts) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("after path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("after path "s + pc.pathName()).c_str()); } } @@ -807,7 +789,7 @@ void NVProfilerService::preModuleEventPrefetching(edm::StreamContext const& sc, auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " prefetching"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } } @@ -815,7 +797,7 @@ void NVProfilerService::postModuleEventPrefetching(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not 
skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -826,14 +808,14 @@ void NVProfilerService::preModuleConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -843,14 +825,14 @@ void NVProfilerService::preModuleBeginJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " begin job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleBeginJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -860,14 +842,14 @@ void NVProfilerService::preModuleEndJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " end job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleEndJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -879,7 +861,7 @@ void NVProfilerService::preModuleEventAcquire(edm::StreamContext const& sc, edm: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " acquire"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -887,7 +869,7 @@ void NVProfilerService::postModuleEventAcquire(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -898,7 +880,7 @@ void NVProfilerService::preModuleEvent(edm::StreamContext const& sc, edm::Module auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); 
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColor(label)); } } @@ -906,7 +888,7 @@ void NVProfilerService::postModuleEvent(edm::StreamContext const& sc, edm::Modul auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -919,7 +901,7 @@ void NVProfilerService::preModuleEventDelayedGet(edm::StreamContext const& sc, e auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " delayed get"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColorLight(label)); } */ } @@ -929,7 +911,7 @@ void NVProfilerService::postModuleEventDelayedGet(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -943,7 +925,7 @@ void NVProfilerService::preEventReadFromSource(edm::StreamContext const& sc, edm auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " read from source"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } */ } @@ -953,7 +935,7 @@ void NVProfilerService::postEventReadFromSource(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -966,7 +948,7 @@ void NVProfilerService::preModuleStreamBeginRun(edm::StreamContext const& sc, ed auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -974,7 +956,7 @@ void NVProfilerService::postModuleStreamBeginRun(edm::StreamContext const& sc, e auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -986,7 +968,7 @@ void NVProfilerService::preModuleStreamEndRun(edm::StreamContext const& sc, edm: auto 
const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -994,7 +976,7 @@ void NVProfilerService::postModuleStreamEndRun(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1006,7 +988,7 @@ void NVProfilerService::preModuleStreamBeginLumi(edm::StreamContext const& sc, e auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1014,7 +996,7 @@ void NVProfilerService::postModuleStreamBeginLumi(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1026,7 +1008,7 @@ void NVProfilerService::preModuleStreamEndLumi(edm::StreamContext const& sc, edm auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1034,7 +1016,7 @@ void NVProfilerService::postModuleStreamEndLumi(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1044,14 +1026,14 @@ void NVProfilerService::preModuleGlobalBeginRun(edm::GlobalContext const& gc, ed auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1061,14 +1043,14 @@ void NVProfilerService::preModuleGlobalEndRun(edm::GlobalContext const& gc, 
edm: auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1078,14 +1060,14 @@ void NVProfilerService::preModuleGlobalBeginLumi(edm::GlobalContext const& gc, e auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1095,14 +1077,14 @@ void NVProfilerService::preModuleGlobalEndLumi(edm::GlobalContext const& gc, edm auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1113,14 +1095,14 @@ void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postSourceConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc new file mode 100644 index 0000000000000..d8aefa42e9c99 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc @@ -0,0 +1,4 @@ +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + 
+DEFINE_FWK_SERVICE(CUDAService); diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh new file mode 100644 index 0000000000000..bde3e26382976 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh @@ -0,0 +1,10 @@ +#! /bin/bash -e + +# move to the .../src directory +cd $CMSSW_BASE/src/ + +# check out all packages containing .cu files +git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg + +# rebuild all checked out packages +scram b -j diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh new file mode 100644 index 0000000000000..f3335f4cd409f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh @@ -0,0 +1,19 @@ +#! /bin/bash +TOOL=$CMSSW_BASE/config/toolbox/$SCRAM_ARCH/tools/selected/cuda.xml + +# enumerate the supported streaming multiprocessor (sm) compute capabilities +DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u) +CAPS=$(echo $DOTS | sed -e's#\.*##g') + +# remove existing capabilities +sed -i $TOOL -e'\##d' + +# add support for the capabilities found on this machine +for CAP in $CAPS; do + sed -i $TOOL -e"\##a\ " +done + +# reconfigure the cuda.xml tool +scram setup cuda + +echo "SCRAM configured to support CUDA streaming multiprocessor architectures $DOTS" diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100644 index 0000000000000..331ddd30f73bd --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from __future__ import print_function +import re +import sys +import argparse + +def main(opts): + device = [] + host = [] + + device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes") + host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes") + + f = open(opts.file) + for line in f: + m = device_re.search(line) + if m: + device.append(m.group("bytes")) + continue + m = host_re.search(line) + if m: + host.append(m.group("bytes")) + f.close() + + print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device)) + print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file. + +To use, run the job once with "process.CUDAService.allocator.debug = +True" and direct the output to a file. Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/scripts/nvprof-remote b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote new file mode 100644 index 0000000000000..3b010c005291f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote @@ -0,0 +1,23 @@ +#! /bin/bash + +# find the CMSSW release +if [ -z "$CMSSW_BASE" ]; then + export CMSSW_BASE=$(readlink -f $(dirname $0)/../..)
+fi + +# load the CMS environment +source $(< "$CMSSW_BASE"/config/scram_basedir)/cmsset_default.sh + +# load the CMSSW release environment +eval `cd "$CMSSW_BASE"; scram runtime -sh 2> /dev/null` + +# log the commands being run +{ + date + echo "cwd: $PWD" + echo "cmd: $0 $@" + echo +} >> $CMSSW_BASE/tmp/nvprof.log + +# run the CUDA profiler +nvprof "$@" diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc new file mode 100644 index 0000000000000..1568e5bb508eb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -0,0 +1,387 @@ +#include +#include +#include + +#include + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h" + +void setCudaLimit(cudaLimit limit, const char* name, size_t request) { + // read the current device + int device; + cudaCheck(cudaGetDevice(&device)); + // try to set the requested limit + auto result = cudaDeviceSetLimit(limit, request); + if (cudaErrorUnsupportedLimit == result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + cudaCheck(cudaDeviceGetLimit(&value, limit)); + if (cudaSuccess != result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to " + << request << ", current value is " << value; + } else if (value != request) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value + << " instead of requested " << request; + } +} + +constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { + switch (major * 10 + minor) { + // Fermi architecture + case 20: // SM 2.0: GF100 class + return 32; + case 21: // SM 2.1: GF10x class + return 48; + + // Kepler architecture + case 30: // SM 3.0: GK10x class + case 32: // SM 3.2: GK10x class + case 35: // SM 3.5: GK11x class + case 37: // SM 3.7: GK21x class + return 192; + + // Maxwell architecture + case 50: // SM 5.0: GM10x class + case 52: // SM 5.2: GM20x class + case 53: // SM 5.3: GM20x class + return 128; + + // Pascal architecture + case 60: // SM 6.0: GP100 class + return 64; + case 61: // SM 6.1: GP10x class + case 62: // SM 6.2: GP10x class + return 128; + + // Volta architecture + case 70: // SM 7.0: GV100 class + case 72: // SM 7.2: GV11b class + return 64; + + // Turing architecture + case 75: // SM 7.5: TU10x class + return 64; + + // unknown architecture, return a default value + default: + return 64; + } +} + +namespace { + template