diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml new file mode 100644 index 0000000000000..98033aab4d99d --- /dev/null +++ b/CUDADataFormats/Common/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h new file mode 100644 index 0000000000000..75c9c80e7f206 --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -0,0 +1,55 @@ +#ifndef CUDADataFormats_Common_CUDAProduct_h +#define CUDADataFormats_Common_CUDAProduct_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" + +namespace edm { + template + class Wrapper; +} +namespace impl { + class CUDAScopedContextGetterBase; +} + +/** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with CUDAScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can use them only where synchronization between streams is needed. + */ +template +class CUDAProduct : public CUDAProductBase { +public: + CUDAProduct() = default; // Needed only for ROOT dictionary generation + + CUDAProduct(const CUDAProduct&) = delete; + CUDAProduct& operator=(const CUDAProduct&) = delete; + CUDAProduct(CUDAProduct&&) = default; + CUDAProduct& operator=(CUDAProduct&&) = default; + +private: + friend class impl::CUDAScopedContextGetterBase; + friend class CUDAScopedContextProduce; + friend class edm::Wrapper>; + + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, T data) + : CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} + + template + explicit CUDAProduct(int device, cudautils::SharedStreamPtr stream, Args&&... args) + : CUDAProductBase(device, std::move(stream)), data_(std::forward(args)...) {} + + T data_; //! +}; + +#endif diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h new file mode 100644 index 0000000000000..219b7e619de7f --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -0,0 +1,90 @@ +#ifndef CUDADataFormats_Common_CUDAProductBase_h +#define CUDADataFormats_Common_CUDAProductBase_h + +#include +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" + +namespace impl { + class CUDAScopedContextBase; +} + +/** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. 
+ */ +class CUDAProductBase { +public: + CUDAProductBase() = default; // Needed only for ROOT dictionary generation + ~CUDAProductBase(); + + CUDAProductBase(const CUDAProductBase&) = delete; + CUDAProductBase& operator=(const CUDAProductBase&) = delete; + CUDAProductBase(CUDAProductBase&& other) + : stream_{std::move(other.stream_)}, + event_{std::move(other.event_)}, + mayReuseStream_{other.mayReuseStream_.load()}, + device_{other.device_} {} + CUDAProductBase& operator=(CUDAProductBase&& other) { + stream_ = std::move(other.stream_); + event_ = std::move(other.event_); + mayReuseStream_ = other.mayReuseStream_.load(); + device_ = other.device_; + return *this; + } + + bool isValid() const { return stream_.get() != nullptr; } + bool isAvailable() const; + + int device() const { return device_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + + // cudaEvent_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaEvent_t event() const { return event_ ? event_.get() : nullptr; } + +protected: + explicit CUDAProductBase(int device, cudautils::SharedStreamPtr stream) + : stream_{std::move(stream)}, device_{device} {} + +private: + friend class impl::CUDAScopedContextBase; + friend class CUDAScopedContextProduce; + + // The following functions are intended to be used only from CUDAScopedContext + void setEvent(cudautils::SharedEventPtr event) { event_ = std::move(event); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + bool mayReuseStream() const { + bool expected = true; + bool changed = mayReuseStream_.compare_exchange_strong(expected, false); + // If the current thread is the one flipping the flag, it may + // reuse the stream. + return changed; + } + + // The cudaStream_t is really shared among edm::Event products, so + // using shared_ptr also here + cudautils::SharedStreamPtr stream_; //! + // shared_ptr because of caching in CUDAEventCache + cudautils::SharedEventPtr event_; //! + + // This flag tells whether the CUDA stream may be reused by a + // consumer or not. The goal is to have a "chain" of modules to + // queue their work to the same stream. + mutable std::atomic mayReuseStream_ = true; //! + + // The CUDA device associated with this product + int device_ = -1; //! +}; + +#endif diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc new file mode 100644 index 0000000000000..72302d3165676 --- /dev/null +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -0,0 +1,27 @@ +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +bool CUDAProductBase::isAvailable() const { + // In absence of event, the product was available already at the end + // of produce() of the producer. + if (not event_) { + return true; + } + return cudautils::eventIsOccurred(event_.get()); +} + +CUDAProductBase::~CUDAProductBase() { + // Make sure that the production of the product in the GPU is + // complete before destructing the product. 
This is to make sure + // that the EDM stream does not move to the next event before all + // asynchronous processing of the current is complete. + if (event_) { + // TODO: a callback notifying a WaitingTaskHolder (or similar) + // would avoid blocking the CPU, but would also require more work. + // + // Intentionally not checking the return value to avoid throwing + // exceptions. If this call would fail, we should get failures + // elsewhere as well. + cudaEventSynchronize(event_.get()); + } +} diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml new file mode 100644 index 0000000000000..5e804fe80a736 --- /dev/null +++ b/CUDADataFormats/Common/test/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc new file mode 100644 index 0000000000000..3eb3115571813 --- /dev/null +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -0,0 +1,68 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" + +#include + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { + SECTION("Default constructed") { + auto foo = CUDAProduct(); + REQUIRE(!foo.isValid()); + + auto bar = std::move(foo); + } + + if (not hasCUDADevices()) { + return; + } + + constexpr int defaultDevice = 0; + cudaCheck(cudaSetDevice(defaultDevice)); + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + std::unique_ptr> dataPtr = ctx.wrap(10); + auto& data = *dataPtr; + + SECTION("Construct from CUDAScopedContext") { + REQUIRE(data.isValid()); + REQUIRE(data.device() == defaultDevice); + REQUIRE(data.stream() == ctx.stream()); + REQUIRE(data.event() != nullptr); + } + + SECTION("Move constructor") { + auto data2 = CUDAProduct(std::move(data)); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + + SECTION("Move assignment") { + CUDAProduct data2; + data2 = std::move(data); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h index efc7d9e6cde0b..44f7b1ca14944 100644 --- a/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h +++ b/FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h @@ -24,9 +24,9 @@ 
#include "tbb/task_arena.h" -namespace edm { +#include "FWCore/Concurrency/interface/WaitingTask.h" - class WaitingTask; +namespace edm { class WaitingTaskHolder; class WaitingTaskWithArenaHolder { @@ -72,5 +72,29 @@ namespace edm { WaitingTask* m_task; std::shared_ptr m_arena; }; + + template + auto make_lambda_with_holder(WaitingTaskWithArenaHolder h, F&& f) { + return [holder = std::move(h), func = std::forward(f)]() mutable { + try { + func(holder); + } catch (...) { + holder.doneWaiting(std::current_exception()); + } + }; + } + + template + auto make_waiting_task_with_holder(ALLOC&& iAlloc, WaitingTaskWithArenaHolder h, F&& f) { + return make_waiting_task( + std::forward(iAlloc), + [holder = h, func = make_lambda_with_holder(h, std::forward(f))](std::exception_ptr const* excptr) mutable { + if (excptr) { + holder.doneWaiting(*excptr); + return; + } + func(); + }); + } } // namespace edm #endif diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml new file mode 100644 index 0000000000000..d78c8a28f0470 --- /dev/null +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..3948ae7e59f79 --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,1003 @@ +# CUDA algorithms in CMSSW + +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA output](#producer-with-cuda-output) + * [Producer with CUDA input](#producer-with-cuda-input) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Analyzer with CUDA input](#analyzer-with-cuda-input) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) + * [CUDA ESProduct](#cuda-esproduct) + +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. 
Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multiple devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). + * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Launch kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with + `CUDAScopedContextAcquire`/`CUDAScopedContextProduce`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. + * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams + +## Sub-packages +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data + +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... 
+ IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerOutputGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT> outputToken_; + CUDAContextState ctxState_; +}; +... +void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +### Producer with CUDA input + +```cpp +class ProducerInputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; + edm::EDPutTokenT outputToken_; +}; +... 
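+// (Illustrative sketch only, not part of the original example: the tokens
+// declared above would typically be initialized in the module constructor.
+// The configuration parameter names "src" and "otherSrc" are hypothetical.)
+// ProducerInputCUDA::ProducerInputCUDA(edm::ParameterSet const& iConfig)
+//     : inputToken_{consumes<CUDAProduct<InputData>>(iConfig.getParameter<edm::InputTag>("src"))},
+//       otherInputToken_{consumes<CUDAProduct<OtherInputData>>(iConfig.getParameter<edm::InputTag>("otherSrc"))},
+//       outputToken_{produces<OutputData>()} {}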
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +See [further below](#setting-the-current-device) for the conditions +when the `CUDAScopedContextAcquire` constructor reuses the CUDA stream. Note +that the `CUDAScopedContextAcquire` constructor taking `edm::StreamID` is +allowed, it will just always create a new CUDA stream. + + +### Producer with CUDA input and output (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. 
If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc) + + +### Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + void addMoreWork(edm::WaitingTaskWithArenaHolder waitingTashHolder); + + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAcquire holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAcquire::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Push a functor on top of "a stack of tasks" to be run as a next + // task after the work queued above before produce(). In this case ctx + // is a context constructed by the calling TBB task, and therefore the + // current device and CUDA stream have been already set up. The ctx + // internally holds the WaitingTaskWithArenaHolder for the next task. 
+ + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + addMoreWork(ctx); + }); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished, + // and saves the device and CUDA stream to ctxState_ +} + +// Called after the asynchronous work queued in acquire() has finished +void ProducerInputOutputCUDA::addMoreWork(CUDAScopedContextTask& ctx) { + // Current device and CUDA stream have already been set + + // Queues more asynchronous data transfer and kernels to the CUDA + // stream returned by CUDAScopedContextTask::stream() + gpuAlgo.makeMoreAsync(ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work queued in addMoreWork() has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContextProduce ctx{ctxState_}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContextProduce::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEWTask.cc) + + +### Producer with CUDA input and output (without ExternalWork) + +If the producer does not need to transfer anything back to CPU (like +the number of output elements), the `ExternalWork` extension is not +needed as there is no need to synchronize. + +```cpp +class ProducerInputOutputCUDA: public edm::global::EDProducer<> { +public: + ... + void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextProduce ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextProduce holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextProduce::stream(). Here makeAsync() also + // returns data in GPU memory that is passed to the constructor of + // OutputData. CUDAScopedContextProduce::emplace() wraps the OutputData to + // CUDAProduct. CUDAProduct stores also the current + // device and the CUDA stream since those will be needed in the + // consumer side. 
+ ctx.emplace(iEvent, outputToken, gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc) + + +### Analyzer with CUDA input + +Analyzer with CUDA input is similar to [producer with CUDA +input](#producer-with-cuda-input). Note that currently we do not have +a mechanism for portable configurations with analyzers (like +[`SwitchProducer`](#automatic-switching-between-cpu-and-gpu-modules) +for producers). This means that a configuration with a CUDA analyzer +can only run on a machine with CUDA device(s). + +```cpp +class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> { +public: + ... + void analyzer(edm::Event const& iEvent, edm::EventSetup const& iSetup) override; + ... +private: + ... + AnalyzerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; +}; +... +void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and possibly use the same CUDA stream + CUDAScopedContextAnalyze ctx{inputDataWrapped}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContextAnalyze holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContextAnalyze::stream() + gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream()); +} +``` + +[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc) + + +### Configuration + +#### GPU-only configuration + +For a GPU-only configuration there is nothing special to be done, just +construct the Paths/Sequences/Tasks from the GPU modules. + +#### Automatic switching between CPU and GPU modules + +The `SwitchProducer` mechanism can be used to switch automatically +between CPU and GPU modules based on the availability of GPUs on the +machine where the configuration is done. Framework decides at the +beginning of the job which of the modules to run for a given module +label. + +Framework requires that the modules in the switch must produce the +same types of output products (the closer the actual results are the +better, but the framework can not enforce that). This means that for a +chain of GPU modules, it is the module that transforms the SoA data +format back to the legacy data formats (possibly, but not necessarily, +transferring the SoA data from GPU to CPU) that should be switched +between the legacy CPU module. The rest of the GPU modules should be +placed to a `Task`, in which case framework runs them only if their +output is needed by another module. 
+ +```python +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +process.foo = SwitchProducerCUDA( + cpu = cms.EDProducer("FooProducer"), # legacy CPU + cuda = cms.EDProducer("FooProducerFromCUDA", + src="fooCUDA" + ) +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") + +process.fooTaskCUDA = cms.Task(process.fooCUDA) +process.fooTask = cms.Task( + process.foo, + process.fooTaskCUDA +) +``` + +For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py). + + + + + +## More details + +### Device choice + +As discussed above, with `SwitchProducer` the choice between CPU and +GPU modules is done at the beginning of the job. + +For multi-GPU setup the device is chosen in the first CUDA module in a +chain of modules by one of the constructors of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce` +```cpp +// In ExternalWork acquire() +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; + +// In normal produce() (or filter()) +CUDAScopedContextProduce ctx{iEvent.streamID()}; +``` +As the choice is still the static EDM stream to device assignment, the +EDM stream ID is needed. The logic will likely evolve in the future to +be more dynamic, and likely the device choice has to be made for the +full event. + +### Data model + +The "GPU data product" should be a class/struct containing smart +pointer(s) to device data (see [Memory allocation](#memory-allocation)). +When putting the data to event, the data is wrapped to +`CUDAProduct` template, which holds +* the GPU data product + * must be moveable, but no other restrictions +* the current device where the data was produced, and the CUDA stream the data was produced with +* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) + +Note that the `CUDAProduct` wrapper can be constructed only with +`CUDAScopedContextProduce::wrap()`, and the data `T` can be obtained +from it only with +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +as described further below. When putting the data product directly to +`edm::Event`, also `CUDASCopedContextProduce::emplace()` can be used. + +The GPU data products that depend on the CUDA runtime should be placed +under `CUDADataFormats` package, using the same name for sub-package +that would be used in `DataFormats`. Everything else, e.g. SoA for +CPU, should go under `DataFormats` as usual. + + +### CUDA EDProducer + +#### Class declaration + +The CUDA producers are normal EDProducers. The `ExternalWork` +extension should be used if a synchronization between the GPU and CPU +is needed, e.g. when transferring data from GPU to CPU. + +#### Memory allocation + +##### Caching allocator + +The memory allocations should be done dynamically with the following functions +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +cudautils::device::unique_ptr device_buffer = cudautils::make_device_unique(50, cudaStream); +cudautils::host::unique_ptr host_buffer = cudautils::make_host_unique(50, cudaStream); +``` + +in the `acquire()` and `produce()` functions. The same +`cudaStream_t` object that is used for transfers and kernels +should be passed to the allocator. + +The allocator is based on `cub::CachingDeviceAllocator`. 
The memory is +guaranteed to be reserved +* for the host: up to the destructor of the `unique_ptr` +* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished + +##### Non-cached pinned host `unique_ptr` + +In producers transferring data to GPU one may want to pinned host +memory allocated with `cudaHostAllocWriteCombined`. As of now we don't +want to include the flag dimension to the caching allocator. The CUDA +API wrapper library does not support allocation flags, so we add our +own `unique_ptr` for that. + +```cpp +#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" + +cudautils::host::noncached_unique_ptr host_buffer = cudautils::make_host_noncached_unique(50, flags); +``` +The `flags` is passed directly to `cudaHostAlloc()`. + +##### CUDA API + +The `cudaMalloc()` etc may be used outside of the event loop, but that +should be limited to only relatively small allocations in order to +allow as much re-use of device memory as possible. + +If really needed, the `cudaMalloc()` etc may be used also within the +event loop, but then the cost of allocation and implicit +synchronization should be explicitly amortized e.g. by caching. + +#### Setting the current device + +A CUDA producer should construct `CUDAScopedContextAcquire` in +`acquire()` (`CUDAScopedContextProduce` `produce()` if not using +`ExternalWork`) either with `edm::StreamID`, or with a +`CUDAProduct` read as an input. + +```cpp +// From edm::StreamID +CUDAScopedContextAcquire ctx{iEvent.streamID(), ...}; +// or +CUDAScopedContextProduce ctx{iEvent.streamID()}; + + +// From CUDAProduct +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAcquire ctx{cclus, ...}; +// or +CUDAScopedContextProduce ctx{cclus}; +``` + +A CUDA analyzer should construct `CUDAScopedContextAnalyze` with a +`CUDAProduct` read as an input. + +```cpp +CUDAProduct const& cclus = iEvent.get(srcToken_); +CUDAScopedContextAnalyze ctx{cclus}; +``` + +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze` work in the RAII way and does the following +* Sets the current device for the current scope + - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream + - If constructed from the `CUDAProduct`, uses the same device and possibly the same CUDA stream as was used to produce the `CUDAProduct` + * The CUDA stream is reused if this producer is the first consumer + of the `CUDAProduct`, otherwise a new CUDA stream is created. + This approach is simple compromise to automatically express the work of + parallel producers in different CUDA streams, and at the same + time allow a chain of producers to queue their work to the same + CUDA stream. +* Gives access to the CUDA stream the algorithm should use to queue asynchronous work +* `CUDAScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) +* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) +* Needed to get `CUDAProduct` from the event + * `CUDAScopedContextProduce` is needed to put `CUDAProduct` to the event + +In case of multiple input products, from possibly different CUDA +streams and/or CUDA devices, this approach gives the developer full +control in which of them the kernels of the algorithm should be run. 
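+
+As a sketch (with hypothetical product types and token names, not taken
+from an existing module), a producer with two CUDA inputs could construct
+the context from the product whose device and CUDA stream the kernels
+should run on, and access the other input through the context:
+
+```cpp
+CUDAProduct<InputData1> const& in1 = iEvent.get(input1Token_);
+CUDAProduct<InputData2> const& in2 = iEvent.get(input2Token_);
+
+// Kernels of this module are queued to the device (and possibly the
+// CUDA stream) of in1
+CUDAScopedContextAcquire ctx{in1, std::move(waitingTaskHolder)};
+
+auto const& data1 = ctx.get(in1);
+// Throws if in2 resides on a different device; if in2 was produced in a
+// different CUDA stream, synchronizes with a CUDA event and cudaStreamWaitEvent()
+auto const& data2 = ctx.get(in2);
+
+gpuAlgo_.makeAsync(data1, data2, ctx.stream());
+```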
+ +#### Getting input + +The real product (`T`) can be obtained from `CUDAProduct` only with +the help of +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +// From CUDAProduct +CUDAProduct cclus = iEvent.get(srcToken_); +GPUClusters const& clus = ctx.get(cclus); + +// Directly from Event +GPUClusters const& clus = ctx.get(iEvent, srcToken_); +``` + +This step is needed to +* check that the data are on the same CUDA device + * if not, throw an exception (with unified memory could prefetch instead) +* if the CUDA streams are different, synchronize between them + +#### Calling the CUDA kernels + +It is usually best to wrap the CUDA kernel calls to a separate class, +and then call methods of that class from the EDProducer. The only +requirement is that the CUDA stream where to queue the operations +should be the one from the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`. + +```cpp +gpuAlgo.makeClustersAsync(..., ctx.stream()); +``` + +If necessary, different CUDA streams may be used internally, but they +should to be made to synchronize with the provided CUDA stream with +CUDA events and `cudaStreamWaitEvent()`. + + +#### Putting output + +The GPU data needs to be wrapped to `CUDAProduct` template with +`CUDAScopedContextProduce::wrap()` or `CUDAScopedContextProduce::emplace()` + +```cpp +GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); +std::unique_ptr> ret = ctx.wrap(clusters); +iEvent.put(std::move(ret)); + +// or with one line +iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); + +// or avoid one unique_ptr with emplace +edm::PutTokenT> putToken_ = produces>(); // in constructor +... +ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); +``` + +This step is needed to +* store the current device and CUDA stream into `CUDAProduct` +* record the CUDA event needed for CUDA stream synchronization + +#### `ExternalWork` extension + +Everything above works both with and without `ExternalWork`. + +Without `ExternalWork` the `EDProducer`s act similar to TBB +flowgraph's "streaming node". In other words, they just queue more +asynchronous work to the CUDA stream in their `produce()`. + +The `ExternalWork` is needed when one would otherwise call +`cudeStreamSynchronize()`. For example transferring something to CPU +needed for downstream DQM, or queueing more asynchronous work. With +`ExternalWork` an `acquire()` method needs to be implemented that gets +an `edm::WaitingTaskWithArenaHolder` parameter. The +`edm::WaitingTaskWithArenaHolder` should then be passed to the +constructor of `CUDAScopedContextAcquire` along + +```cpp +void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& cclus = iEvent.get(token_); + CUDAScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well + ... +``` + +When constructed this way, `CUDAScopedContextAcquire` registers a +callback function to the CUDA stream in its destructor to call +`waitingTaskHolder.doneWaiting()`. + +A GPU->GPU producer needs a `CUDAScopedContext` also in its +`produce()`. The device and CUDA stream are transferred via +`CUDAContextState` member variable: + +```cpp +class FooProducerCUDA ... { + ... + CUDAContextState ctxState_; +}; + +void acquire(...) { + ... + FooProducerCUDA::CUDAScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; + ... +} + +void produce(...( { + ... 
+ FooProducerCUDA::CUDAScopedContextProduce ctx{ctxState_}; +} +``` + +The `CUDAScopedContextAcquire` saves its state to the `ctxState_` in +the destructor, and `CUDAScopedContextProduce` then restores the +context. + +#### Module-internal chain of CPU and GPU tasks + +Technically `ExternalWork` works such that the framework calls +`acquire()` with a `edm::WaitingTaskWithArenaHolder` that holds an +`edm::WaitingTask` (that inherits from `tbb::task`) for calling +`produce()` in a `std::shared_ptr` semantics: spawn the task when +reference count hits `0`. It is also possible to create a longer chain +of such tasks, alternating between CPU and GPU work. This mechanism +can also be used to re-run (part of) the GPU work. + +The "next tasks" to run are essentially structured as a stack, such +that +- `CUDAScopedContextAcquire`/`CUDAScopedContextTask::pushNextTask()` + pushes a new functor on top of the stack +- Completion of both the asynchronous work and the queueing function + pops the top task of the stack and enqueues it (so that TBB + eventually runs the task) + * Technically the task is made eligible to run when all copies of + `edm::WaitingTaskWithArenaHolder` of the acquire() (or "previous" + function) have either been destructed or their `doneWaiting()` has + been called + * The code calling `acquire()` or the functor holds one copy of + `edm::WaitingTaskWithArenaHolder` so it is guaranteed that the + next function will not run before the earlier one has finished + + +Below is an example how to push a functor on top of the stack of tasks +to run next (following the example of the previous section) +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... + }); + ... +} +``` + +In this case the `ctx`argument to the function is a +`CUDAScopedContexTask` object constructed by the TBB task calling the +user-given function. It follows that the current device and CUDA +stream have been set up already. The `pushNextTask()` can be called +many times. On each invocation the `pushNextTask()` pushes a new task +on top of the stack (i.e. in front of the chain). It follows that in +```cpp +void FooProducerCUDA::acquire(...) { + ... + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 1 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 2 + }); + ctx.pushNextTask([this](CUDAScopedContextTask ctx) { + ... // function 3 + }); + ... +} +``` +the functions will be run in the order 3, 2, 1. + +**Note** that the `CUDAService` is **not** available (nor is any other +service) in these intermediate tasks. In the near future memory +allocations etc. will be made possible by taking them out from the +`CUDAService`. + +The `CUDAScopedContextAcquire`/`CUDAScopedContextTask` have also a +more generic member function, `replaceWaitingTaskHolder()`, that can +be used to just replace the currently-hold +`edm::WaitingTaskWithArenaHolder` (that will get notified by the +callback function) with anything. In this case the caller is +responsible of creating the task(s) and setting up the chain of them. + + +#### Transferring GPU data to CPU + +The GPU->CPU data transfer needs synchronization to ensure the CPU +memory to have all data before putting that to the event. This means +the `ExternalWork` needs to be used along +* In `acquire()` + * (allocate CPU memory buffers) + * Queue all GPU->CPU transfers asynchronously +* In `produce()` + * If needed, read additional CPU products (e.g. 
from `edm::Ref`s) + * Reformat data back to legacy data formats + * Note: `CUDAScopedContextProduce` is **not** needed in `produce()` + +#### Synchronizing between CUDA streams + +In case the producer needs input data that were produced in two (or +more) CUDA streams, these streams have to be synchronized. Here this +synchronization is achieved with CUDA events. + +Each `CUDAProduct` constains also a CUDA event object. The call to +`CUDAScopedContextProduce::wrap()` will *record* the event in the CUDA +stream. This means that when all work queued to the CUDA stream up to +that point has been finished, the CUDA event becomes *occurred*. Then, +in +`CUDAScopedContextAcquire::get()`/`CUDAScopedContextProduce::get()`/`CUDAScopedContextAnalyze::get()`, +if the `CUDAProduct` to get from has a different CUDA stream than +the +`CUDAScopedContextAcquire`/`CUDAScopedContextProduce`/`CUDAScopedContextAnalyze`, +`cudaStreamWaitEvent(stream, event)` is called. This means that all +subsequent work queued to the CUDA stream will wait for the CUDA event +to become occurred. Therefore this subsequent work can assume that the +to-be-getted CUDA product exists. + + +### CUDA ESProduct + +Conditions data can be transferred to the device with the following +pattern. + +1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload") +2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory +3. The wrapper should have a function returning the payload on the + device memory. The function should transfer the data to the device + asynchronously with the help of `CUDAESProduct`. + +#### Example + +```cpp +#include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" + +// Declare the struct for the payload to be transferred. Here the +// example is an array with (potentially) dynamic size. Note that all of +// below becomes simpler if the array has compile-time size. +struct ESProductExampleCUDA { + float *someData; + unsigned int size; +}; + +// Declare the wrapper ESProduct. The corresponding ESProducer should +// produce objects of this type. 
+class ESProductExampleCUDAWrapper { +public: + // Constructor takes the standard CPU ESProduct, and transforms the + // necessary data to array(s) in pinned host memory + ESProductExampleCUDAWrapper(ESProductExample const&); + + // Deallocates all pinned host memory + ~ESProductExampleCUDAWrapper(); + + // Function to return the actual payload on the memory of the current device + ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const; + +private: + // Holds the data in pinned CPU memory + float *someData_; + unsigned int size_; + + // Helper struct to hold all information that has to be allocated and + // deallocated per device + struct GPUData { + // Destructor should free all member pointers + ~GPUData(); + // internal pointers are on device, struct itself is on CPU + ESProductExampleCUDA *esproductHost = nullptr; + // internal pounters and struct are on device + ESProductExampleCUDA *esproductDevice = nullptr; + }; + + // Helper that takes care of complexity of transferring the data to + // multiple devices + CUDAESProduct gpuData_; +}; + +ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) { + cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS)); + // fill someData_ and size_ from cpuProduct +} + +ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const { + // CUDAESProduct essentially holds an array of GPUData objects, + // one per device. If the data have already been transferred to the + // current device (or the transfer has been queued), the helper just + // returns a reference to that GPUData object. Otherwise, i.e. data are + // not yet on the current device, the helper calls the lambda to do the + // necessary memory allocations and to queue the transfers. + auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) { + // Allocate memory. Currently this can be with the CUDA API, + // sometime we'll migrate to the caching allocator. Assumption is + // that IOV changes are rare enough that adding global synchronization + // points is not that bad (for now). + + // Allocate the payload object on pinned host memory. + cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA))); + // Allocate the payload array(s) on device memory. + cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS)); + + // Allocate the payload object on the device memory. + cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductDevice))); + + // Complete the host-side information on the payload + data.cablingMapHost->size = this->size_; + + + // Transfer the payload, first the array(s) ... + cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream)); + // ... 
and then the payload object + cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproduceHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream)); +}); + + // Returns the payload object on the memory of the current device + return data.esproductDevice; +} + +// Destructor frees all member pointers +ESProductExampleCUDA::GPUData::~GPUData() { + if(esproductHost != nullptr) { + cudaCheck(cudaFree(esproductHost->someData)); + cudaCheck(cudaFreeHost(esproductHost)); + } + cudaCheck(cudaFree(esProductDevice)); +} + +``` diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextState.h b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h new file mode 100644 index 0000000000000..b3c20dcb73159 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextState.h @@ -0,0 +1,57 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAContextState_h +#define HeterogeneousCore_CUDACore_CUDAContextState_h + +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +#include + +/** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. + */ +class CUDAContextState { +public: + CUDAContextState() = default; + ~CUDAContextState() = default; + + CUDAContextState(const CUDAContextState&) = delete; + CUDAContextState& operator=(const CUDAContextState&) = delete; + CUDAContextState(CUDAContextState&&) = delete; + CUDAContextState& operator=(CUDAContextState&& other) = delete; + +private: + friend class CUDAScopedContextAcquire; + friend class CUDAScopedContextProduce; + friend class CUDAScopedContextTask; + + void set(int device, cudautils::SharedStreamPtr stream) { + throwIfStream(); + device_ = device; + stream_ = std::move(stream); + } + + int device() const { return device_; } + + const cudautils::SharedStreamPtr& streamPtr() const { + throwIfNoStream(); + return stream_; + } + + cudautils::SharedStreamPtr releaseStreamPtr() { + throwIfNoStream(); + // This function needs to effectively reset stream_ (i.e. stream_ + // must be empty after this function). This behavior ensures that + // the SharedStreamPtr is not hold for inadvertedly long (i.e. to + // the next event), and is checked at run time. 
+ return std::move(stream_); + } + + void throwIfStream() const; + void throwIfNoStream() const; + + cudautils::SharedStreamPtr stream_; + int device_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h new file mode 100644 index 0000000000000..b8b230e510fa3 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAESProduct.h @@ -0,0 +1,100 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAESProduct_h +#define HeterogeneousCore_CUDACore_CUDAESProduct_h + +#include +#include +#include +#include + +#include "FWCore/Concurrency/interface/hardware_pause.h" +#include "FWCore/Utilities/interface/thread_safety_macros.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaDeviceCount.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" + +template +class CUDAESProduct { +public: + CUDAESProduct() : gpuDataPerDevice_(cudautils::cudaDeviceCount()) { + for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) { + gpuDataPerDevice_[i].m_event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + } + ~CUDAESProduct() = default; + + // transferAsync should be a function of (T&, cudaStream_t) + // which enqueues asynchronous transfers (possibly kernels as well) + // to the CUDA stream + template + const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const { + auto device = cudautils::currentDevice(); + + auto& data = gpuDataPerDevice_[device]; + + // If GPU data has already been filled, we can return it + // immediately + if (not data.m_filled.load()) { + // It wasn't, so need to fill it + std::scoped_lock lk{data.m_mutex}; + + if (data.m_filled.load()) { + // Other thread marked it filled while we were locking the mutex, so we're free to return it + return data.m_data; + } + + if (data.m_fillingStream != nullptr) { + // Someone else is filling + + // Check first if the recorded event has occurred + if (cudautils::eventIsOccurred(data.m_event.get())) { + // It was, so data is accessible from all CUDA streams on + // the device. Set the 'filled' for all subsequent calls and + // return the value + auto should_be_false = data.m_filled.exchange(true); + assert(not should_be_false); + data.m_fillingStream = nullptr; + } else if (data.m_fillingStream != cudaStream) { + // Filling is still going on. For other CUDA stream, add + // wait on the CUDA stream and return the value. Subsequent + // work queued on the stream will wait for the event to + // occur (i.e. transfer to finish). + cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0), + "Failed to make a stream to wait for an event"); + } + // else: filling is still going on. But for the same CUDA + // stream (which would be a bit strange but fine), we can just + // return as all subsequent work should be enqueued to the + // same CUDA stream (or stream to be explicitly synchronized + // by the caller) + } else { + // Now we can be sure that the data is not yet on the GPU, and + // this thread is the first to try that. 
+ transferAsync(data.m_data, cudaStream); + assert(data.m_fillingStream == nullptr); + data.m_fillingStream = cudaStream; + // Now the filling has been enqueued to the cudaStream, so we + // can return the GPU data immediately, since all subsequent + // work must be either enqueued to the cudaStream, or the cudaStream + // must be synchronized by the caller + } + } + + return data.m_data; + } + +private: + struct Item { + mutable std::mutex m_mutex; + CMS_THREAD_GUARD(m_mutex) mutable cudautils::SharedEventPtr m_event; + // non-null if some thread is already filling (cudaStream_t is just a pointer) + CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr; + mutable std::atomic m_filled = false; // easy check if data has been filled already or not + CMS_THREAD_GUARD(m_mutex) mutable T m_data; + }; + + std::vector gpuDataPerDevice_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h new file mode 100644 index 0000000000000..758218bb958a2 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -0,0 +1,252 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h +#define HeterogeneousCore_CUDACore_CUDAScopedContext_h + +#include + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Utilities/interface/EDGetToken.h" +#include "FWCore/Utilities/interface/EDPutToken.h" +#include "FWCore/Utilities/interface/StreamID.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" + +namespace cudatest { + class TestCUDAScopedContext; +} + +namespace impl { + // This class is intended to be derived by other CUDAScopedContext*, not for general use + class CUDAScopedContextBase { + public: + int device() const { return currentDevice_; } + + // cudaStream_t is a pointer to a thread-safe object, for which a + // mutable access is needed even if the CUDAScopedContext itself + // would be const. Therefore it is ok to return a non-const + // pointer from a const method here. + cudaStream_t stream() const { return stream_.get(); } + const cudautils::SharedStreamPtr& streamPtr() const { return stream_; } + + protected: + // The constructors set the current device device, but the device + // is not set back to the previous value at the destructor. This + // should be sufficient (and tiny bit faster) as all CUDA API + // functions relying on the current device should be called from + // the scope where this context is. The current device doesn't + // really matter between modules (or across TBB tasks). 
+ explicit CUDAScopedContextBase(edm::StreamID streamID); + + explicit CUDAScopedContextBase(const CUDAProductBase& data); + + explicit CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream); + + private: + int currentDevice_; + cudautils::SharedStreamPtr stream_; + }; + + class CUDAScopedContextGetterBase : public CUDAScopedContextBase { + public: + template + const T& get(const CUDAProduct& data) { + synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event()); + return data.data_; + } + + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + + protected: + template + CUDAScopedContextGetterBase(Args&&... args) : CUDAScopedContextBase(std::forward(args)...) {} + + void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent); + }; + + class CUDAScopedContextHolderHelper { + public: + CUDAScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : waitingTaskHolder_{std::move(waitingTaskHolder)} {} + + template + void pushNextTask(F&& f, CUDAContextState const* state); + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + waitingTaskHolder_ = std::move(waitingTaskHolder); + } + + void enqueueCallback(int device, cudaStream_t stream); + + private: + edm::WaitingTaskWithArenaHolder waitingTaskHolder_; + }; +} // namespace impl + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire(): + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
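+ *
+ * A minimal usage sketch for an ExternalWork producer (the member and token
+ * names below are illustrative, not part of this interface):
+ *
+ *   void MyProducer::acquire(edm::Event const& iEvent, edm::EventSetup const&,
+ *                            edm::WaitingTaskWithArenaHolder holder) {
+ *     CUDAScopedContextAcquire ctx{iEvent.streamID(), std::move(holder), ctxState_};
+ *     auto const& input = ctx.get(iEvent, srcToken_);
+ *     // ... queue asynchronous work to ctx.stream() ...
+ *   }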
+ */ +class CUDAScopedContextAcquire : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to create a new CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(edm::StreamID streamID, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire()) + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {} + + /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire() + explicit CUDAScopedContextAcquire(const CUDAProductBase& data, + edm::WaitingTaskWithArenaHolder waitingTaskHolder, + CUDAContextState& state) + : CUDAScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {} + + ~CUDAScopedContextAcquire(); + + template + void pushNextTask(F&& f) { + if (contextState_ == nullptr) + throwNoState(); + holderHelper_.pushNextTask(std::forward(f), contextState_); + } + + void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder)); + } + +private: + void throwNoState(); + + impl::CUDAScopedContextHolderHelper holderHelper_; + CUDAContextState* contextState_ = nullptr; +}; + +/** + * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce(): + * - setting the current device + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. + */ +class CUDAScopedContextProduce : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to create a new CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(edm::StreamID streamID) : CUDAScopedContextGetterBase(streamID) {} + + /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module) + explicit CUDAScopedContextProduce(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} + + /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module) + explicit CUDAScopedContextProduce(CUDAContextState& state) + : CUDAScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {} + + ~CUDAScopedContextProduce(); + + template + std::unique_ptr> wrap(T data) { + // make_unique doesn't work because of private constructor + // + // CUDAProduct constructor records CUDA event to the CUDA + // stream. The event will become "occurred" after all work queued + // to the stream before this point has been finished. + std::unique_ptr> ret(new CUDAProduct(device(), streamPtr(), std::move(data))); + createEventIfStreamBusy(); + ret->setEvent(event_); + return ret; + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) {
+    auto ret = iEvent.emplace(token, device(), streamPtr(), std::forward(args)...);
+    createEventIfStreamBusy();
+    const_cast(*ret).setEvent(event_);
+    return ret;
+  }
+
+private:
+  friend class cudatest::TestCUDAScopedContext;
+
+  // This constructor is only meant for testing
+  explicit CUDAScopedContextProduce(int device, cudautils::SharedStreamPtr stream, cudautils::SharedEventPtr event)
+      : CUDAScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
+
+  void createEventIfStreamBusy();
+
+  cudautils::SharedEventPtr event_;
+};
+
+/**
+ * The aim of this class is to do necessary per-task "initialization" in tasks created in ExternalWork acquire():
+ * - setting the current device
+ * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
+ * and enforce that those get done in a proper way in RAII fashion.
+ */
+class CUDAScopedContextTask : public impl::CUDAScopedContextBase {
+public:
+  /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
+  explicit CUDAScopedContextTask(CUDAContextState const* state, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
+      : CUDAScopedContextBase(state->device(), state->streamPtr()),  // don't move, state is re-used afterwards
+        holderHelper_{std::move(waitingTaskHolder)},
+        contextState_{state} {}
+
+  ~CUDAScopedContextTask();
+
+  template
+  void pushNextTask(F&& f) {
+    holderHelper_.pushNextTask(std::forward(f), contextState_);
+  }
+
+  void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
+    holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
+  }
+
+private:
+  impl::CUDAScopedContextHolderHelper holderHelper_;
+  CUDAContextState const* contextState_;
+};
+
+/**
+ * The aim of this class is to do necessary per-event "initialization" in analyze()
+ * - setting the current device
+ * - synchronizing between CUDA streams if necessary
+ * and enforce that those get done in a proper way in RAII fashion.
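+ *
+ * A minimal usage sketch (the token and type names below are illustrative):
+ *
+ *   CUDAProduct<InputData> const& input = iEvent.get(srcToken_);
+ *   CUDAScopedContextAnalyze ctx{input};
+ *   InputData const& data = ctx.get(input);
+ *   // ... queue asynchronous work to ctx.stream() ...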
+ */ +class CUDAScopedContextAnalyze : public impl::CUDAScopedContextGetterBase { +public: + /// Constructor to (possibly) re-use a CUDA stream + explicit CUDAScopedContextAnalyze(const CUDAProductBase& data) : CUDAScopedContextGetterBase(data) {} +}; + +namespace impl { + template + void CUDAScopedContextHolderHelper::pushNextTask(F&& f, CUDAContextState const* state) { + replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder{ + edm::make_waiting_task_with_holder(tbb::task::allocate_root(), + std::move(waitingTaskHolder_), + [state, func = std::forward(f)](edm::WaitingTaskWithArenaHolder h) { + func(CUDAScopedContextTask{state, std::move(h)}); + })}); + } +} // namespace impl + +#endif diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py new file mode 100644 index 0000000000000..ded114e2fddfe --- /dev/null +++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py @@ -0,0 +1,34 @@ +import FWCore.ParameterSet.Config as cms + +_cuda_enabled_cached = None + +def _switch_cuda(): + global _cuda_enabled_cached + if _cuda_enabled_cached is None: + import os + _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0) + return (_cuda_enabled_cached, 2) + +class SwitchProducerCUDA(cms.SwitchProducer): + def __init__(self, **kargs): + super(SwitchProducerCUDA,self).__init__( + dict(cpu = cms.SwitchProducer.getCpu(), + cuda = _switch_cuda), + **kargs + ) +cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA") + +if __name__ == "__main__": + import unittest + + class TestSwitchProducerCUDA(unittest.TestCase): + def testPickle(self): + import pickle + sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar")) + pkl = pickle.dumps(sp) + unpkl = pickle.loads(pkl) + self.assertEqual(unpkl.cpu.type_(), "Foo") + self.assertEqual(unpkl.cuda.type_(), "Bar") + + unittest.main() + diff --git a/HeterogeneousCore/CUDACore/src/CUDAContextState.cc b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc new file mode 100644 index 0000000000000..bcdbae89d9094 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAContextState.cc @@ -0,0 +1,14 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAContextState.h" +#include "FWCore/Utilities/interface/Exception.h" + +void CUDAContextState::throwIfStream() const { + if (stream_) { + throw cms::Exception("LogicError") << "Trying to set CUDAContextState, but it already had a valid state"; + } +} + +void CUDAContextState::throwIfNoStream() const { + if (not stream_) { + throw cms::Exception("LogicError") << "Trying to get CUDAContextState, but it did not have a valid state"; + } +} diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc new file mode 100644 index 0000000000000..df56c318e22fa --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -0,0 +1,134 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "chooseCUDADevice.h" + +namespace { + struct CallbackData { + 
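+    // Passed as the user data of cudaStreamAddCallback() in enqueueCallback()
+    // below, and released (via std::unique_ptr) inside cudaScopedContextCallback().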
edm::WaitingTaskWithArenaHolder holder; + int device; + }; + + void CUDART_CB cudaScopedContextCallback(cudaStream_t streamId, cudaError_t status, void* data) { + std::unique_ptr guard{reinterpret_cast(data)}; + edm::WaitingTaskWithArenaHolder& waitingTaskHolder = guard->holder; + int device = guard->device; + if (status == cudaSuccess) { + LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " + << streamId; + waitingTaskHolder.doneWaiting(nullptr); + } else { + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device + << " error " << error << ": " << message; + } catch (cms::Exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } + } + } +} // namespace + +namespace impl { + CUDAScopedContextBase::CUDAScopedContextBase(edm::StreamID streamID) + : currentDevice_(cudacore::chooseCUDADevice(streamID)) { + cudaCheck(cudaSetDevice(currentDevice_)); + stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + } + + CUDAScopedContextBase::CUDAScopedContextBase(const CUDAProductBase& data) : currentDevice_(data.device()) { + cudaCheck(cudaSetDevice(currentDevice_)); + if (data.mayReuseStream()) { + stream_ = data.streamPtr(); + } else { + stream_ = cudautils::getCUDAStreamCache().getCUDAStream(); + } + } + + CUDAScopedContextBase::CUDAScopedContextBase(int device, cudautils::SharedStreamPtr stream) + : currentDevice_(device), stream_(std::move(stream)) { + cudaCheck(cudaSetDevice(currentDevice_)); + } + + //////////////////// + + void CUDAScopedContextGetterBase::synchronizeStreams(int dataDevice, + cudaStream_t dataStream, + bool available, + cudaEvent_t dataEvent) { + if (dataDevice != device()) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; + } + + if (dataStream != stream()) { + // Different streams, need to synchronize + if (not available) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). 
+ cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event"); + } + } + } + + void CUDAScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) { + cudaCheck( + cudaStreamAddCallback(stream, cudaScopedContextCallback, new CallbackData{waitingTaskHolder_, device}, 0)); + } +} // namespace impl + +//////////////////// + +CUDAScopedContextAcquire::~CUDAScopedContextAcquire() { + holderHelper_.enqueueCallback(device(), stream()); + if (contextState_) { + contextState_->set(device(), std::move(streamPtr())); + } +} + +void CUDAScopedContextAcquire::throwNoState() { + throw cms::Exception("LogicError") + << "Calling CUDAScopedContextAcquire::insertNextTask() requires CUDAScopedContextAcquire to be constructed with " + "CUDAContextState, but that was not the case"; +} + +//////////////////// + +CUDAScopedContextProduce::~CUDAScopedContextProduce() { + if (event_) { + cudaCheck(cudaEventRecord(event_.get(), stream())); + } +} + +void CUDAScopedContextProduce::createEventIfStreamBusy() { + if (event_) { + return; + } + auto ret = cudaStreamQuery(stream()); + if (ret == cudaSuccess) { + return; + } + if (ret != cudaErrorNotReady) { + // cudaErrorNotReady indicates that the stream is busy, and thus + // is not an error + cudaCheck(ret); + } + + event_ = cudautils::getCUDAEventCache().getCUDAEvent(); +} + +//////////////////// + +CUDAScopedContextTask::~CUDAScopedContextTask() { holderHelper_.enqueueCallback(device(), stream()); } diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc new file mode 100644 index 0000000000000..7e9ac2faed380 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -0,0 +1,18 @@ +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "chooseCUDADevice.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id) { + edm::Service cudaService; + + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
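+    // For example, with 8 edm::Streams and 3 devices the assignment would be
+    //   stream id: 0 1 2 3 4 5 6 7
+    //   device:    0 1 2 0 1 2 0 1
+    // i.e. devices 0 and 1 each serve three streams while device 2 serves two.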
+ // + // TODO: improve the "assignment" logic + return id % cudaService->numberOfDevices(); + } +} // namespace cudacore diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h new file mode 100644 index 0000000000000..bb09c302af7f5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h +#define HeterogeneousCore_CUDACore_chooseCUDADevice_h + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id); +} + +#endif diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml new file mode 100644 index 0000000000000..a6f34c70e8822 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/testStreamEvent.cu b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu new file mode 100644 index 0000000000000..f819a78f698e4 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/testStreamEvent.cu @@ -0,0 +1,134 @@ +/** + * The purpose of this test program is to ensure that the logic for + * CUDA event use in CUDAProduct and CUDAScopedContext + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" + +namespace { + constexpr int ARRAY_SIZE = 20000000; + constexpr int NLOOPS = 10; +} // namespace + +__global__ void kernel_looping(float *point, unsigned int num) { + unsigned int idx = threadIdx.x + blockIdx.x * blockDim.x; + + for (int iloop = 0; iloop < NLOOPS; ++iloop) { + for (size_t offset = idx; offset < num; offset += gridDim.x * blockDim.x) { + point[offset] += 1; + } + } +} + +int main() { + requireCUDADevices(); + + constexpr bool debug = false; + + float *dev_points1; + float *host_points1; + cudaStream_t stream1, stream2; + cudaEvent_t event1, event2; + + cudaCheck(cudaMalloc(&dev_points1, ARRAY_SIZE * sizeof(float))); + cudaCheck(cudaMallocHost(&host_points1, ARRAY_SIZE * sizeof(float))); + cudaStreamCreateWithFlags(&stream1, cudaStreamNonBlocking); + cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking); + cudaEventCreate(&event1); + cudaEventCreate(&event2); + + for (size_t j = 0; j < ARRAY_SIZE; ++j) { + host_points1[j] = static_cast(j); + } + + cudaCheck(cudaMemcpyAsync(dev_points1, host_points1, ARRAY_SIZE * sizeof(float), cudaMemcpyHostToDevice, stream1)); + kernel_looping<<<1, 16, 0, stream1>>>(dev_points1, ARRAY_SIZE); + if (debug) + std::cout << "Kernel launched on stream1" << std::endl; + + auto status = cudaStreamQuery(stream1); + if (debug) + std::cout << "Stream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + cudaEventRecord(event1, stream1); + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaSuccess); + if (debug) { + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + cudaStreamWaitEvent(stream2, event1, 0); + if (debug) + std::cout << "\nStream2 waiting for event1" << std::endl; + status = cudaStreamQuery(stream2); + if (debug) + std::cout << "Stream2 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) << std::endl; + assert(status == cudaErrorNotReady); + cudaEventRecord(event2, stream2); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaErrorNotReady); + if (debug) { + std::this_thread::sleep_for(std::chrono::milliseconds(1)); + status = cudaEventQuery(event2); + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + } + + status = cudaStreamQuery(stream1); + if (debug) { + std::cout << "\nStream1 busy? " << (status == cudaErrorNotReady) << " idle? " << (status == cudaSuccess) + << std::endl; + std::cout << "Synchronizing stream1" << std::endl; + } + assert(status == cudaErrorNotReady); + cudaStreamSynchronize(stream1); + if (debug) + std::cout << "Synchronized stream1" << std::endl; + + status = cudaEventQuery(event1); + if (debug) + std::cout << "Event1 recorded? " << (status == cudaErrorNotReady) << " occurred? " << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + status = cudaEventQuery(event2); + if (debug) + std::cout << "Event2 recorded? " << (status == cudaErrorNotReady) << " occurred? 
" << (status == cudaSuccess) + << std::endl; + assert(status == cudaSuccess); + + cudaFree(dev_points1); + cudaFreeHost(host_points1); + cudaStreamDestroy(stream1); + cudaStreamDestroy(stream2); + cudaEventDestroy(event1); + cudaEventDestroy(event2); + + return 0; +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc new file mode 100644 index 0000000000000..219e4dfb20103 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -0,0 +1,134 @@ +#include "catch.hpp" + +#include "CUDADataFormats/Common/interface/CUDAProduct.h" +#include "FWCore/Concurrency/interface/WaitingTask.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/eventIsOccurred.h" +#include "HeterogeneousCore/CUDAUtilities/interface/requireCUDADevices.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h" + +#include "test_CUDAScopedContextKernels.h" + +namespace cudatest { + class TestCUDAScopedContext { + public: + static CUDAScopedContextProduce make(int dev, bool createEvent) { + cudautils::SharedEventPtr event; + if (createEvent) { + event = cudautils::getCUDAEventCache().getCUDAEvent(); + } + return CUDAScopedContextProduce(dev, cudautils::getCUDAStreamCache().getCUDAStream(), std::move(event)); + } + }; +} // namespace cudatest + +namespace { + std::unique_ptr> produce(int device, int* d, int* h) { + auto ctx = cudatest::TestCUDAScopedContext::make(device, true); + cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream())); + testCUDAScopedContextKernels_single(d, ctx.stream()); + return ctx.wrap(d); + } +} // namespace + +TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { + if (not hasCUDADevices()) { + return; + } + + constexpr int defaultDevice = 0; + { + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice, true); + + SECTION("Construct from device ID") { REQUIRE(cudautils::currentDevice() == defaultDevice); } + + SECTION("Wrap T to CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + REQUIRE(dataPtr.get() != nullptr); + REQUIRE(dataPtr->device() == ctx.device()); + REQUIRE(dataPtr->stream() == ctx.stream()); + } + + SECTION("Construct from from CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + + CUDAScopedContextProduce ctx2{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx2.stream() == data.stream()); + + // Second use of a product should lead to new stream + CUDAScopedContextProduce ctx3{data}; + REQUIRE(cudautils::currentDevice() == data.device()); + REQUIRE(ctx3.stream() != data.stream()); + } + + SECTION("Storing state in CUDAContextState") { + CUDAContextState ctxstate; + { // acquire + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + edm::WaitingTaskWithArenaHolder dummy{ + edm::make_waiting_task(tbb::task::allocate_root(), [](std::exception_ptr const* iPtr) {})}; + CUDAScopedContextAcquire ctx2{data, 
std::move(dummy), ctxstate}; + } + + { // produce + CUDAScopedContextProduce ctx2{ctxstate}; + REQUIRE(cudautils::currentDevice() == ctx.device()); + REQUIRE(ctx2.stream() == ctx.stream()); + } + } + + SECTION("Joining multiple CUDA streams") { + cudautils::ScopedSetDevice setDeviceForThisScope(defaultDevice); + + // Mimick a producer on the first CUDA stream + int h_a1 = 1; + auto d_a1 = cudautils::make_device_unique(nullptr); + auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); + + // Mimick a producer on the second CUDA stream + int h_a2 = 2; + auto d_a2 = cudautils::make_device_unique(nullptr); + auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2); + + REQUIRE(wprod1->stream() != wprod2->stream()); + + // Mimick a third producer "joining" the two streams + CUDAScopedContextProduce ctx2{*wprod1}; + + auto prod1 = ctx2.get(*wprod1); + auto prod2 = ctx2.get(*wprod2); + + auto d_a3 = cudautils::make_device_unique(nullptr); + testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream()); + cudaCheck(cudaStreamSynchronize(ctx2.stream())); + REQUIRE(wprod2->isAvailable()); + REQUIRE(cudautils::eventIsOccurred(wprod2->event())); + + h_a1 = 0; + h_a2 = 0; + int h_a3 = 0; + + cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream())); + + REQUIRE(h_a1 == 2); + REQUIRE(h_a2 == 4); + REQUIRE(h_a3 == 6); + } + } + + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + // Note: CUDA resources are cleaned up by the destructors of the global cache objects +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu new file mode 100644 index 0000000000000..330e83dfd4960 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu @@ -0,0 +1,13 @@ +#include "test_CUDAScopedContextKernels.h" + +namespace { + __global__ void single_mul(int *d) { d[0] = d[0] * 2; } + + __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; } +} // namespace + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); } + +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) { + join_add<<<1, 1, 0, stream>>>(d1, d2, d3); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h new file mode 100644 index 0000000000000..527a4ce71e1cb --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h +#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h + +#include + +void testCUDAScopedContextKernels_single(int *d, cudaStream_t stream); +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream); + +#endif diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc new file mode 100644 index 0000000000000..2e1027598a4de --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_main.cc @@ -0,0 +1,31 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" + +#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h" 
+#include "FWCore/PluginManager/interface/PluginManager.h" +#include "FWCore/PluginManager/interface/standard.h" +#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h" + +class ServiceRegistryListener : public Catch::TestEventListenerBase { +public: + using Catch::TestEventListenerBase::TestEventListenerBase; // inherit constructor + + void testRunStarting(Catch::TestRunInfo const& testRunInfo) override { + edmplugin::PluginManager::configure(edmplugin::standard::config()); + + const std::string config{ + R"_(import FWCore.ParameterSet.Config as cms +process = cms.Process('Test') +process.CUDAService = cms.Service('CUDAService') +)_"}; + + std::unique_ptr params; + edm::makeParameterSets(config, params); + edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params))); + operate_.reset(new edm::ServiceRegistry::Operate(tempToken)); + } + +private: + std::unique_ptr operate_; +}; +CATCH_REGISTER_LISTENER(ServiceRegistryListener); diff --git a/HeterogeneousCore/CUDAServices/BuildFile.xml b/HeterogeneousCore/CUDAServices/BuildFile.xml new file mode 100644 index 0000000000000..9320cad14f285 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/BuildFile.xml @@ -0,0 +1,11 @@ + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml new file mode 100644 index 0000000000000..041ed25ba134a --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp new file mode 100644 index 0000000000000..5a65575873116 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaComputeCapabilities.cpp @@ -0,0 +1,23 @@ +// C++ standard headers +#include +#include + +// CUDA headers +#include + +// CMSSW headers +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +int main() { + int devices = 0; + cudaCheck(cudaGetDeviceCount(&devices)); + + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + std::cout << std::setw(4) << i << " " << std::setw(2) << properties.major << "." << properties.minor << " " + << properties.name << std::endl; + } + + return 0; +} diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp new file mode 100644 index 0000000000000..d901e1850bceb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. 
+ for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if (properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h new file mode 100644 index 0000000000000..625ce40fdcdc9 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDAServices_CUDAService_h +#define HeterogeneousCore_CUDAServices_CUDAService_h + +#include +#include + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace edm { + class ParameterSet; + class ActivityRegistry; + class ConfigurationDescriptions; +} // namespace edm + +/** + * TODO: + * - CUDA stream management? + * * Not really needed until we want to pass CUDA stream objects from one module to another + * * Which is not really needed until we want to go for "streaming mode" + * * Until that framework's inter-module synchronization is safe (but not necessarily optimal) + * - Management of (preallocated) memory? + */ +class CUDAService { +public: + CUDAService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAService(); + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + bool enabled() const { return enabled_; } + + int numberOfDevices() const { return numberOfDevices_; } + + // major, minor + std::pair computeCapability(int device) { return computeCapabilities_.at(device); } + + // Returns the id of device with most free memory. If none is found, returns -1. + int deviceWithMostFreeMemory() const; + +private: + int numberOfDevices_ = 0; + std::vector> computeCapabilities_; + bool enabled_ = false; +}; + +#endif diff --git a/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h new file mode 100644 index 0000000000000..b563b98b516cf --- /dev/null +++ b/HeterogeneousCore/CUDAServices/interface/numberOfCUDADevices.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDAServices_numberOfCUDADevices_h +#define HeterogeneousCore_CUDAServices_numberOfCUDADevices_h + +// Returns the number of CUDA devices +// The difference wrt. the standard CUDA function is that if +// CUDAService is disabled, this function returns 0. 
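+// This lets callers treat "no GPUs present" and "CUDA disabled in the
+// configuration" uniformly, without having to consult CUDAService themselves.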
+int numberOfCUDADevices(); + +#endif diff --git a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml index afcf86afdef75..81d4f20331ce3 100644 --- a/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/plugins/BuildFile.xml @@ -1,18 +1,15 @@ -#Skip building plugins by dropping all files for none-AMD64 build - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc new file mode 100644 index 0000000000000..6d8527935e334 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc @@ -0,0 +1,107 @@ +#include + +#include + +#include "DataFormats/Provenance/interface/ModuleDescription.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "FWCore/ServiceRegistry/interface/ModuleCallingContext.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +namespace edm { + class StreamContext; +} + +class CUDAMonitoringService { +public: + CUDAMonitoringService(edm::ParameterSet const& iConfig, edm::ActivityRegistry& iRegistry); + ~CUDAMonitoringService() = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void postModuleConstruction(edm::ModuleDescription const& desc); + void postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc); + void postEvent(edm::StreamContext const& sc); + +private: + int numberOfDevices_ = 0; +}; + +CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + if (!cudaService->enabled()) + return; + numberOfDevices_ = cudaService->numberOfDevices(); + + if (config.getUntrackedParameter("memoryConstruction")) { + registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction); + } + if (config.getUntrackedParameter("memoryBeginStream")) { + registry.watchPostModuleBeginStream(this, &CUDAMonitoringService::postModuleBeginStream); + } + if (config.getUntrackedParameter("memoryPerEvent")) { + registry.watchPostEvent(this, &CUDAMonitoringService::postEvent); + } +} + +void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.addUntracked("memoryConstruction", false) + ->setComment("Print memory information for each device after the construction of each module"); + desc.addUntracked("memoryBeginStream", true) + ->setComment("Print memory information for each device after the beginStream() of each module"); + desc.addUntracked("memoryPerEvent", true) + ->setComment("Print memory information for each device after each event"); + + descriptions.add("CUDAMonitoringService", desc); + descriptions.setComment( + "The memory information is the global state of the device. 
This gets confusing if there are multiple processes " + "running on the same device. Probably the information retrieval should be re-thought?"); +} + +// activity handlers +namespace { + template + void dumpUsedMemory(T& log, int num) { + int old = 0; + cudaCheck(cudaGetDevice(&old)); + for (int i = 0; i < num; ++i) { + size_t freeMemory, totalMemory; + cudaCheck(cudaSetDevice(i)); + cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); + log << "\n" + << i << ": " << (totalMemory - freeMemory) / (1 << 20) << " MB used / " << totalMemory / (1 << 20) + << " MB total"; + } + cudaCheck(cudaSetDevice(old)); + } +} // namespace + +void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" + << mcc.moduleDescription()->moduleName() << ")"; + dumpUsedMemory(log, numberOfDevices_); +} + +void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { + auto log = edm::LogPrint("CUDAMonitoringService"); + log << "CUDA device memory after event"; + dumpUsedMemory(log, numberOfDevices_); +} + +DEFINE_FWK_SERVICE(CUDAMonitoringService); diff --git a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc index ec8c4deac4d4d..29fa1ab959025 100644 --- a/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/NVProfilerService.cc @@ -41,6 +41,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/ProductKindOfType.h" #include "FWCore/Utilities/interface/TimeOfDay.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" using namespace std::string_literals; @@ -285,9 +286,8 @@ class NVProfilerService { std::vector highlightModules_; const bool showModulePrefetching_; - bool skipFirstEvent_; + const bool skipFirstEvent_; - unsigned int concurrentStreams_; std::atomic globalFirstEventDone_ = false; std::vector> streamFirstEventDone_; std::vector event_; // per-stream event ranges @@ -295,49 +295,22 @@ class NVProfilerService { // use a tbb::concurrent_vector rather than an std::vector because its final size is not known tbb::concurrent_vector global_modules_; // global per-module events -private: - struct Domains { - nvtxDomainHandle_t global; - std::vector stream; - - Domains(NVProfilerService* service) { - global = nvtxDomainCreate("EDM Global"); - allocate_streams(service->concurrentStreams_); - } - - ~Domains() { - nvtxDomainDestroy(global); - for (unsigned int sid = 0; sid < stream.size(); ++sid) { - nvtxDomainDestroy(stream[sid]); - } - } - - void allocate_streams(unsigned int streams) { - stream.resize(streams); - for (unsigned int sid = 0; sid < streams; ++sid) { - stream[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); - } - } - }; - - // allow access to concurrentStreams_ - friend struct Domains; - - tbb::enumerable_thread_specific domains_; - - nvtxDomainHandle_t global_domain() { return domains_.local().global; } - - nvtxDomainHandle_t stream_domain(unsigned int sid) { return 
domains_.local().stream.at(sid); } + nvtxDomainHandle_t global_domain_; // NVTX domain for global EDM transitions + std::vector stream_domain_; // NVTX domains for per-EDM-stream transitions }; NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) : highlightModules_(config.getUntrackedParameter>("highlightModules")), showModulePrefetching_(config.getUntrackedParameter("showModulePrefetching")), - skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")), - concurrentStreams_(0), - domains_(this) { + skipFirstEvent_(config.getUntrackedParameter("skipFirstEvent")) { + // make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor + edm::Service cudaService; + std::sort(highlightModules_.begin(), highlightModules_.end()); + // create the NVTX domain for global EDM transitions + global_domain_ = nvtxDomainCreate("EDM Global"); + // enables profile collection; if profiling is already enabled it has no effect if (not skipFirstEvent_) { cudaProfilerStart(); @@ -491,7 +464,13 @@ NVProfilerService::NVProfilerService(edm::ParameterSet const& config, edm::Activ registry.watchPostEventReadFromSource(this, &NVProfilerService::postEventReadFromSource); } -NVProfilerService::~NVProfilerService() { cudaProfilerStop(); } +NVProfilerService::~NVProfilerService() { + for (unsigned int sid = 0; sid < stream_domain_.size(); ++sid) { + nvtxDomainDestroy(stream_domain_[sid]); + } + nvtxDomainDestroy(global_domain_); + cudaProfilerStop(); +} void NVProfilerService::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -517,17 +496,20 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { out << "preallocate: " << bounds.maxNumberOfConcurrentRuns() << " concurrent runs, " << bounds.maxNumberOfConcurrentLuminosityBlocks() << " luminosity sections, " << bounds.maxNumberOfStreams() << " streams\nrunning on" << bounds.maxNumberOfThreads() << " threads"; - nvtxDomainMark(global_domain(), out.str().c_str()); + nvtxDomainMark(global_domain_, out.str().c_str()); - concurrentStreams_ = bounds.maxNumberOfStreams(); - for (auto& domain : domains_) { - domain.allocate_streams(concurrentStreams_); + auto concurrentStreams = bounds.maxNumberOfStreams(); + // create the NVTX domains for per-EDM-stream transitions + stream_domain_.resize(concurrentStreams); + for (unsigned int sid = 0; sid < concurrentStreams; ++sid) { + stream_domain_[sid] = nvtxDomainCreate((boost::format("EDM Stream %d") % sid).str().c_str()); } - event_.resize(concurrentStreams_); - stream_modules_.resize(concurrentStreams_); + + event_.resize(concurrentStreams); + stream_modules_.resize(concurrentStreams); if (skipFirstEvent_) { globalFirstEventDone_ = false; - std::vector> tmp(concurrentStreams_); + std::vector> tmp(concurrentStreams); for (auto& element : tmp) std::atomic_init(&element, false); streamFirstEventDone_ = std::move(tmp); @@ -536,86 +518,86 @@ void NVProfilerService::preallocate(edm::service::SystemBounds const& bounds) { void NVProfilerService::preBeginJob(edm::PathsAndConsumesOfModulesBase const& pathsAndConsumes, edm::ProcessContext const& pc) { - nvtxDomainMark(global_domain(), "preBeginJob"); + nvtxDomainMark(global_domain_, "preBeginJob"); // FIXME this probably works only in the absence of subprocesses // size() + 1 because pathsAndConsumes.allModules() does not include the source unsigned int modules = pathsAndConsumes.allModules().size() + 1; 
global_modules_.resize(modules, nvtxInvalidRangeId); - for (unsigned int sid = 0; sid < concurrentStreams_; ++sid) { + for (unsigned int sid = 0; sid < stream_modules_.size(); ++sid) { stream_modules_[sid].resize(modules, nvtxInvalidRangeId); } } void NVProfilerService::postBeginJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postBeginJob"); + nvtxDomainMark(global_domain_, "postBeginJob"); } } void NVProfilerService::postEndJob() { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainMark(global_domain(), "postEndJob"); + nvtxDomainMark(global_domain_, "postEndJob"); } } void NVProfilerService::preSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "source"); + nvtxDomainRangePush(stream_domain_[sid], "source"); } } void NVProfilerService::postSourceEvent(edm::StreamID sid) { if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source lumi"); + nvtxDomainRangePush(global_domain_, "source lumi"); } } void NVProfilerService::postSourceLumi(edm::LuminosityBlockIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "source run"); + nvtxDomainRangePush(global_domain_, "source run"); } } void NVProfilerService::postSourceRun(edm::RunIndex index) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("open file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("open file "s + lfn).c_str()); } } void NVProfilerService::postOpenFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), ("close file "s + lfn).c_str()); + nvtxDomainRangePush(global_domain_, ("close file "s + lfn).c_str()); } } void NVProfilerService::postCloseFile(std::string const& lfn, bool) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } @@ -626,7 +608,7 @@ void NVProfilerService::preModuleBeginStream(edm::StreamContext const& sc, edm:: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " begin stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -634,7 +616,7 @@ void NVProfilerService::postModuleBeginStream(edm::StreamContext const& sc, edm: auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = 
mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -646,7 +628,7 @@ void NVProfilerService::preModuleEndStream(edm::StreamContext const& sc, edm::Mo auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " end stream"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -654,124 +636,124 @@ void NVProfilerService::postModuleEndStream(edm::StreamContext const& sc, edm::M auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } void NVProfilerService::preGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin run"); + nvtxDomainRangePush(global_domain_, "global begin run"); } } void NVProfilerService::postGlobalBeginRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end run"); + nvtxDomainRangePush(global_domain_, "global end run"); } } void NVProfilerService::postGlobalEndRun(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin run"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin run"); } } void NVProfilerService::postStreamBeginRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream end run"); + nvtxDomainRangePush(stream_domain_[sid], "stream end run"); } } void NVProfilerService::postStreamEndRun(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global begin lumi"); + nvtxDomainRangePush(global_domain_, "global begin lumi"); } } void NVProfilerService::postGlobalBeginLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } 
void NVProfilerService::preGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePush(global_domain(), "global end lumi"); + nvtxDomainRangePush(global_domain_, "global end lumi"); } } void NVProfilerService::postGlobalEndLumi(edm::GlobalContext const& gc) { if (not skipFirstEvent_ or globalFirstEventDone_) { - nvtxDomainRangePop(global_domain()); + nvtxDomainRangePop(global_domain_); } } void NVProfilerService::preStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePush(stream_domain(sid), "stream begin lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream begin lumi"); } } void NVProfilerService::postStreamBeginLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); - nvtxDomainRangePush(stream_domain(sid), "stream end lumi"); + nvtxDomainRangePush(stream_domain_[sid], "stream end lumi"); } void NVProfilerService::postStreamEndLumi(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangePop(stream_domain(sid)); + nvtxDomainRangePop(stream_domain_[sid]); } } void NVProfilerService::preEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - event_[sid] = nvtxDomainRangeStartColor(stream_domain(sid), "event", nvtxDarkGreen); + event_[sid] = nvtxDomainRangeStartColor(stream_domain_[sid], "event", nvtxDarkGreen); } } void NVProfilerService::postEvent(edm::StreamContext const& sc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainRangeEnd(stream_domain(sid), event_[sid]); + nvtxDomainRangeEnd(stream_domain_[sid], event_[sid]); event_[sid] = nvtxInvalidRangeId; } else { streamFirstEventDone_[sid] = true; @@ -787,7 +769,7 @@ void NVProfilerService::postEvent(edm::StreamContext const& sc) { void NVProfilerService::prePathEvent(edm::StreamContext const& sc, edm::PathContext const& pc) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("before path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("before path "s + pc.pathName()).c_str()); } } @@ -796,7 +778,7 @@ void NVProfilerService::postPathEvent(edm::StreamContext const& sc, edm::HLTPathStatus const& hlts) { auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { - nvtxDomainMark(global_domain(), ("after path "s + pc.pathName()).c_str()); + nvtxDomainMark(global_domain_, ("after path "s + pc.pathName()).c_str()); } } @@ -807,7 +789,7 @@ void NVProfilerService::preModuleEventPrefetching(edm::StreamContext const& sc, auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " prefetching"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } } @@ -815,7 +797,7 @@ void NVProfilerService::postModuleEventPrefetching(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not 
skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -826,14 +808,14 @@ void NVProfilerService::preModuleConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -843,14 +825,14 @@ void NVProfilerService::preModuleBeginJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " begin job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleBeginJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -860,14 +842,14 @@ void NVProfilerService::preModuleEndJob(edm::ModuleDescription const& desc) { auto mid = desc.id(); auto const& label = desc.moduleLabel(); auto const& msg = label + " end job"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleEndJob(edm::ModuleDescription const& desc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -879,7 +861,7 @@ void NVProfilerService::preModuleEventAcquire(edm::StreamContext const& sc, edm: auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " acquire"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -887,7 +869,7 @@ void NVProfilerService::postModuleEventAcquire(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -898,7 +880,7 @@ void NVProfilerService::preModuleEvent(edm::StreamContext const& sc, edm::Module auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); 
assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColor(label)); } } @@ -906,7 +888,7 @@ void NVProfilerService::postModuleEvent(edm::StreamContext const& sc, edm::Modul auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -919,7 +901,7 @@ void NVProfilerService::preModuleEventDelayedGet(edm::StreamContext const& sc, e auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " delayed get"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), label.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], label.c_str(), labelColorLight(label)); } */ } @@ -929,7 +911,7 @@ void NVProfilerService::postModuleEventDelayedGet(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -943,7 +925,7 @@ void NVProfilerService::preEventReadFromSource(edm::StreamContext const& sc, edm auto const & label = mcc.moduleDescription()->moduleLabel(); auto const & msg = label + " read from source"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColorLight(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColorLight(label)); } */ } @@ -953,7 +935,7 @@ void NVProfilerService::postEventReadFromSource(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } */ @@ -966,7 +948,7 @@ void NVProfilerService::preModuleStreamBeginRun(edm::StreamContext const& sc, ed auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -974,7 +956,7 @@ void NVProfilerService::postModuleStreamBeginRun(edm::StreamContext const& sc, e auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -986,7 +968,7 @@ void NVProfilerService::preModuleStreamEndRun(edm::StreamContext const& sc, edm: auto 
const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end run"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -994,7 +976,7 @@ void NVProfilerService::postModuleStreamEndRun(edm::StreamContext const& sc, edm auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1006,7 +988,7 @@ void NVProfilerService::preModuleStreamBeginLumi(edm::StreamContext const& sc, e auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream begin lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1014,7 +996,7 @@ void NVProfilerService::postModuleStreamBeginLumi(edm::StreamContext const& sc, auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1026,7 +1008,7 @@ void NVProfilerService::preModuleStreamEndLumi(edm::StreamContext const& sc, edm auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " stream end lumi"; assert(stream_modules_[sid][mid] == nvtxInvalidRangeId); - stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain(sid), msg.c_str(), labelColor(label)); + stream_modules_[sid][mid] = nvtxDomainRangeStartColor(stream_domain_[sid], msg.c_str(), labelColor(label)); } } @@ -1034,7 +1016,7 @@ void NVProfilerService::postModuleStreamEndLumi(edm::StreamContext const& sc, ed auto sid = sc.streamID(); if (not skipFirstEvent_ or streamFirstEventDone_[sid]) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(stream_domain(sid), stream_modules_[sid][mid]); + nvtxDomainRangeEnd(stream_domain_[sid], stream_modules_[sid][mid]); stream_modules_[sid][mid] = nvtxInvalidRangeId; } } @@ -1044,14 +1026,14 @@ void NVProfilerService::preModuleGlobalBeginRun(edm::GlobalContext const& gc, ed auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1061,14 +1043,14 @@ void NVProfilerService::preModuleGlobalEndRun(edm::GlobalContext const& gc, 
edm: auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end run"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndRun(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1078,14 +1060,14 @@ void NVProfilerService::preModuleGlobalBeginLumi(edm::GlobalContext const& gc, e auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global begin lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalBeginLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1095,14 +1077,14 @@ void NVProfilerService::preModuleGlobalEndLumi(edm::GlobalContext const& gc, edm auto mid = mcc.moduleDescription()->id(); auto const& label = mcc.moduleDescription()->moduleLabel(); auto const& msg = label + " global end lumi"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postModuleGlobalEndLumi(edm::GlobalContext const& gc, edm::ModuleCallingContext const& mcc) { if (not skipFirstEvent_ or globalFirstEventDone_) { auto mid = mcc.moduleDescription()->id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } @@ -1113,14 +1095,14 @@ void NVProfilerService::preSourceConstruction(edm::ModuleDescription const& desc global_modules_.grow_to_at_least(mid + 1); auto const& label = desc.moduleLabel(); auto const& msg = label + " construction"; - global_modules_[mid] = nvtxDomainRangeStartColor(global_domain(), msg.c_str(), labelColor(label)); + global_modules_[mid] = nvtxDomainRangeStartColor(global_domain_, msg.c_str(), labelColor(label)); } } void NVProfilerService::postSourceConstruction(edm::ModuleDescription const& desc) { if (not skipFirstEvent_) { auto mid = desc.id(); - nvtxDomainRangeEnd(global_domain(), global_modules_[mid]); + nvtxDomainRangeEnd(global_domain_, global_modules_[mid]); global_modules_[mid] = nvtxInvalidRangeId; } } diff --git a/HeterogeneousCore/CUDAServices/plugins/plugins.cc b/HeterogeneousCore/CUDAServices/plugins/plugins.cc new file mode 100644 index 0000000000000..d8aefa42e9c99 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/plugins/plugins.cc @@ -0,0 +1,4 @@ +#include "FWCore/ServiceRegistry/interface/ServiceMaker.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + 
+DEFINE_FWK_SERVICE(CUDAService); diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh new file mode 100644 index 0000000000000..bde3e26382976 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaRebuild.sh @@ -0,0 +1,10 @@ +#! /bin/bash -e + +# move to the .../src directory +cd $CMSSW_BASE/src/ + +# check out all packages containing .cu files +git ls-files --full-name | grep '.*\.cu$' | cut -d/ -f-2 | sort -u | xargs git cms-addpkg + +# rebuild all checked out packages +scram b -j diff --git a/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh new file mode 100644 index 0000000000000..f3335f4cd409f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cmsCudaSetup.sh @@ -0,0 +1,19 @@ +#! /bin/bash +TOOL=$CMSSW_BASE/config/toolbox/$SCRAM_ARCH/tools/selected/cuda.xml + +# enumerate the supported streaming multiprocessor (sm) compute capabilities +DOTS=$(cudaComputeCapabilities | awk '{ print $2 }' | sort -u) +CAPS=$(echo $DOTS | sed -e's#\.*##g') + +# remove existing capabilities +sed -i $TOOL -e'\##d' + +# add support for the capabilities found on this machine +for CAP in $CAPS; do + sed -i $TOOL -e"\##a\ " +done + +# reconfigure the cuda.xml tool +scram setup cuda + +echo "SCRAM configured to support CUDA streaming multiprocessor architectures $DOTS" diff --git a/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py new file mode 100644 index 0000000000000..331ddd30f73bd --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/cudaPreallocate.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +from __future__ import print_function +import re +import sys +import argparse + +def main(opts): + device = [] + host = [] + + device_re = re.compile("Device.*allocated new device block.*\((?P<bytes>\d+) bytes") + host_re = re.compile("Host.*allocated new host block.*\((?P<bytes>\d+) bytes") + + f = open(opts.file) + for line in f: + m = device_re.search(line) + if m: + device.append(m.group("bytes")) + continue + m = host_re.search(line) + if m: + host.append(m.group("bytes")) + f.close() + + print("process.CUDAService.allocator.devicePreallocate = cms.untracked.vuint32(%s)" % ",".join(device)) + print("process.CUDAService.allocator.hostPreallocate = cms.untracked.vuint32(%s)" % ",".join(host)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Extract CUDAService preallocation parameters from a log file. + +To use, run the job once with "process.CUDAService.allocator.debug = +True" and direct the output to a file. Then run this script by passing +the file as an argument, and copy the output of this script back to +the configuration file.""") + parser.add_argument("file", type=str, help="Log file to parse") + opts = parser.parse_args() + main(opts) diff --git a/HeterogeneousCore/CUDAServices/scripts/nvprof-remote b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote new file mode 100644 index 0000000000000..3b010c005291f --- /dev/null +++ b/HeterogeneousCore/CUDAServices/scripts/nvprof-remote @@ -0,0 +1,23 @@ +#! /bin/bash + +# find the CMSSW release +if [ -z "$CMSSW_BASE" ]; then + export CMSSW_BASE=$(readlink -f $(dirname $0)/../..)
+fi + +# load the CMS environment +source $(< "$CMSSW_BASE"/config/scram_basedir)/cmsset_default.sh + +# load the CMSSW release environment +eval `cd "$CMSSW_BASE"; scram runtime -sh 2> /dev/null` + +# log the commands being run +{ + date + echo "cwd: $PWD" + echo "cmd: $0 $@" + echo +} >> $CMSSW_BASE/tmp/nvprof.log + +# run the CUDA profiler +nvprof "$@" diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc new file mode 100644 index 0000000000000..1568e5bb508eb --- /dev/null +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -0,0 +1,387 @@ +#include +#include +#include + +#include + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAEventCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/CUDAStreamCache.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingDeviceAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/src/getCachingHostAllocator.h" + +void setCudaLimit(cudaLimit limit, const char* name, size_t request) { + // read the current device + int device; + cudaCheck(cudaGetDevice(&device)); + // try to set the requested limit + auto result = cudaDeviceSetLimit(limit, request); + if (cudaErrorUnsupportedLimit == result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": unsupported limit \"" << name << "\""; + return; + } + // read back the limit value + size_t value; + cudaCheck(cudaDeviceGetLimit(&value, limit)); + if (cudaSuccess != result) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": failed to set limit \"" << name << "\" to " + << request << ", current value is " << value; + } else if (value != request) { + edm::LogWarning("CUDAService") << "CUDA device " << device << ": limit \"" << name << "\" set to " << value + << " instead of requested " << request; + } +} + +constexpr unsigned int getCudaCoresPerSM(unsigned int major, unsigned int minor) { + switch (major * 10 + minor) { + // Fermi architecture + case 20: // SM 2.0: GF100 class + return 32; + case 21: // SM 2.1: GF10x class + return 48; + + // Kepler architecture + case 30: // SM 3.0: GK10x class + case 32: // SM 3.2: GK10x class + case 35: // SM 3.5: GK11x class + case 37: // SM 3.7: GK21x class + return 192; + + // Maxwell architecture + case 50: // SM 5.0: GM10x class + case 52: // SM 5.2: GM20x class + case 53: // SM 5.3: GM20x class + return 128; + + // Pascal architecture + case 60: // SM 6.0: GP100 class + return 64; + case 61: // SM 6.1: GP10x class + case 62: // SM 6.2: GP10x class + return 128; + + // Volta architecture + case 70: // SM 7.0: GV100 class + case 72: // SM 7.2: GV11b class + return 64; + + // Turing architecture + case 75: // SM 7.5: TU10x class + return 64; + + // unknown architecture, return a default value + default: + return 64; + } +} + +namespace { + template