From c3773f7568e7e16520c50a2f053eee892d7c1415 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 22 Nov 2023 21:18:21 +0100
Subject: [PATCH 1/9] Add specialization of CopyToDevice for alpaka host buffer

---
 .../AlpakaInterface/interface/CopyToDevice.h  | 31 +++++++++
 .../AlpakaInterface/test/BuildFile.xml        |  7 ++
 .../test/alpaka/testCopyBufferToDevice.dev.cc | 68 +++++++++++++++++++
 3 files changed, 106 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc

diff --git a/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h b/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h
index 8fad8d0729f1a..f98d475185ff3 100644
--- a/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h
+++ b/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h
@@ -32,4 +32,35 @@ namespace cms::alpakatools {
   struct CopyToDevice;
 }  // namespace cms::alpakatools
 
+// specialize to Alpaka buffer
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+namespace cms::alpakatools {
+  // Note: can't do partial specializations along
+  // - CopyToDevice<host_buffer<TObject>>
+  // - CopyToDevice<alpaka::Buf<alpaka_common::DevHost, TObject, alpaka_common::Dim0D, alpaka_common::Idx>>
+  // because both host_buffer and alpaka::Buf use trait-style
+  // indirection that prevents template argument type deduction
+  template <typename TObject>
+  struct CopyToDevice<alpaka::BufCpu<TObject, alpaka_common::Dim0D, alpaka_common::Idx>> {
+    template <typename TQueue>
+    static auto copyAsync(TQueue& queue, host_buffer<TObject> const& src) {
+      // allocate the destination through the queue, then submit the copy to that same queue
+      auto dst = make_device_buffer<TObject>(queue);
+      alpaka::memcpy(queue, dst, src);
+      return dst;
+    }
+  };
+
+  template <typename TObject>
+  struct CopyToDevice<alpaka::BufCpu<TObject, alpaka_common::Dim1D, alpaka_common::Idx>> {
+    template <typename TQueue>
+    static auto copyAsync(TQueue& queue, host_buffer<TObject[]> const& src) {
+      // size the destination from the source extent, then submit the copy to the same queue
+      auto dst = make_device_buffer<TObject[]>(queue, alpaka::getExtentProduct(src));
+      alpaka::memcpy(queue, dst, src);
+      return dst;
+    }
+  };
+}  // namespace cms::alpakatools
+
 #endif
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index 426a750e3d0b9..77aa10cc5a171 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -19,6 +19,13 @@
   <flags ALPAKA_BACKENDS="1"/>
 </bin>
 
+<bin name="alpakaCopyBufferToDevice" file="alpaka/testCopyBufferToDevice.dev.cc">
+  <use name="alpaka"/>
+  <use name="catch2"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</bin>
+
 <bin name="alpakaTestBackend" file="testBackend.cc">
   <use name="catch2"/>
   <use name="HeterogeneousCore/AlpakaInterface"/>
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc
new file mode 100644
index 0000000000000..209c01a6f641f
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc
@@ -0,0 +1,68 @@
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h"
+
+// each test binary is built for a single Alpaka backend
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+namespace {
+  struct Dummy {
+    int x, y, z;
+  };
+}
+
+TEST_CASE("Test CopyToDevice for Alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  SECTION("Buffer of scalar") {
+    auto buffer_host = cms::alpakatools::make_host_buffer<Dummy>();
+
+    // run the test on each device
+    for (auto const& device : cms::alpakatools::devices<Platform>()) {
+      auto queue = Queue(device);
+      using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>;
+      auto buffer_device = Copy::copyAsync(queue, buffer_host);
+      alpaka::wait(queue);
+    }
+  }
+
+  SECTION("Buffer of array with static size") {
+    // The buffer itself is really dynamically sized, even if the
+    // alpakatools API looks like the array would have static size
+    constexpr int N = 10;
+    auto buffer_host = cms::alpakatools::make_host_buffer<int[N]>();
+    for (int i = 0; i < N; ++i) {
+      buffer_host[i] = i;
+    }
+
+    // run the test on each device
+    for (auto const& device : cms::alpakatools::devices<Platform>()) {
+      auto queue = Queue(device);
+      using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>;
+      auto buffer_device = Copy::copyAsync(queue, buffer_host);
+      alpaka::wait(queue);
+      REQUIRE(alpaka::getExtentProduct(buffer_device) == N);
+    }
+  }
+
+  SECTION("Buffer of array with dynamic size") {
+    constexpr int N = 10;
+    auto buffer_host = cms::alpakatools::make_host_buffer<int[]>(N);
+    for (int i = 0; i < N; ++i) {
+      buffer_host[i] = i;
+    }
+
+    // run the test on each device
+    for (auto const& device : cms::alpakatools::devices<Platform>()) {
+      auto queue = Queue(device);
+      using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>;
+      auto buffer_device = Copy::copyAsync(queue, buffer_host);
+      alpaka::wait(queue);
+      REQUIRE(alpaka::getExtentProduct(buffer_device) == N);
+    }
+  }
+}

From 59de41482ea0868dc4318cb5c1dfa4e84139448c Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 22 Nov 2023 18:51:36 +0100
Subject: [PATCH 2/9] Add CopyToDeviceCache class template

---
 .../AlpakaCore/interface/CopyToDeviceCache.h  | 111 ++++++++++++
 .../test/alpaka/testCopyBufferToDevice.dev.cc |   2 +-
 .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 161 ++++++++++++++++++
 .../AlpakaTest/plugins/alpaka/TestAlgo.h      |  14 ++
 ...stAlpakaGlobalProducerCopyToDeviceCache.cc |  83 +++++++++
 .../AlpakaTest/test/testAlpakaModules_cfg.py  |  18 +-
 6 files changed, 387 insertions(+), 2 deletions(-)
 create mode 100644 HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
 create mode 100644 HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc

diff --git a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
new file mode 100644
index 0000000000000..f33c8c170c90c
--- /dev/null
+++ b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
@@ -0,0 +1,111 @@
+#ifndef HeterogeneousCore_AlpakaCore_interface_CopyToDeviceCache_h
+#define HeterogeneousCore_AlpakaCore_interface_CopyToDeviceCache_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaCore/interface/QueueCache.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+
+namespace cms::alpakatools {
+  namespace detail {
+    // Primary template: copy the host object to every device of the
+    // platform with CopyToDevice<THostObject>, and wait for all the copies.
+    //
+    // Implemented as a template specialization (rather than with
+    // std::conditional_t and if constexpr) because CopyToDevice<THostObject>::copyAsync()
+    // is ill-defined e.g. for PortableCollection on the host device
+    template <typename TDev, typename TQueue, typename THostObject>
+    class CopyToDeviceCacheImpl {
+    public:
+      using Device = TDev;
+      using Queue = TQueue;
+      using HostObject = THostObject;
+      using Copy = CopyToDevice<HostObject>;
+      using DeviceObject = decltype(Copy::copyAsync(std::declval<Queue&>(), std::declval<HostObject const&>()));
+
+      explicit CopyToDeviceCacheImpl(HostObject const& srcObject) {
+        using Platform = alpaka::Platform<Device>;
+        auto const& devices = cms::alpakatools::devices<Platform>();
+        std::vector<std::shared_ptr<Queue>> queues;
+        queues.reserve(devices.size());
+        data_.reserve(devices.size());
+        for (auto const& dev : devices) {  // first submit one copy per device ...
+          auto queue = getQueueCache<Queue>().get(dev);
+          data_.emplace_back(Copy::copyAsync(*queue, srcObject));
+          queues.emplace_back(std::move(queue));
+        }
+        for (auto& queuePtr : queues) {  // ... and only then wait for each of them
+          alpaka::wait(*queuePtr);
+        }
+      }
+
+      DeviceObject const& get(size_t i) const { return data_[i]; }  // i indexes the devices in iteration order
+
+    private:
+      std::vector<DeviceObject> data_;
+    };
+
+    // For the host device no transfer is made: a copy of the host object itself is stored
+    template <typename TQueue, typename THostObject>
+    class CopyToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> {
+    public:
+      using HostObject = THostObject;
+      using DeviceObject = HostObject;
+
+      explicit CopyToDeviceCacheImpl(HostObject const& srcObject) : data_(srcObject) {}
+
+      DeviceObject const& get(size_t /*deviceIndex*/) const { return data_; }  // same object for any index
+
+    private:
+      HostObject data_;
+    };
+  }  // namespace detail
+
+  /**
+   * This class template implements a cache for data that is copied
+   * from the host (of type THostObject) to all the devices
+   * corresponding to the TQueue queue type.
+   *
+   * The host-side object to be copied is given as an argument to the
+   * class constructor. The constructor uses the
+   * CopyToDevice<THostObject> class template to perform the copy, and
+   * waits for the data copies to finish, i.e. the constructor is
+   * synchronous wrt. the data copies.
+   *
+   * The device-side object corresponding to the THostObject (its type is
+   * the return type of CopyToDevice<THostObject>::copyAsync(), or THostObject
+   * itself when the device is the host) can be obtained with the get()
+   * member function, which takes either a queue or a device as argument.
+   *
+   * TODO: In principle it would be better to template over Device,
+   * but then we'd need a way to have a "default queue" type for each
+   * Device in order to infer the return type of
+   * CopyToDevice::copyAsync(). Alternatively, the template over
+   * TQueue could be removed by moving the class definition to
+   * ALPAKA_ACCELERATOR_NAMESPACE.
+   */
+  template <typename TQueue, typename THostObject>
+  class CopyToDeviceCache {
+    using Queue = TQueue;
+    using Device = alpaka::Dev<Queue>;
+    using HostObject = THostObject;
+    using Impl = detail::CopyToDeviceCacheImpl<Device, Queue, HostObject>;
+    using DeviceObject = typename Impl::DeviceObject;
+
+  public:
+    CopyToDeviceCache(THostObject const& srcData) : data_(srcData) {}
+
+    // TODO: this function could return the contained object in the
+    // case of an alpaka buffer, PortableObject, or PortableCollection
+    // (in the PortableCollection case it would be the View)
+    DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); }
+
+    DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); }
+
+  private:
+    Impl data_;
+  };
+}  // namespace cms::alpakatools
+
+#endif
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc
index 209c01a6f641f..b4bbb2868412e 100644
--- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc
@@ -14,7 +14,7 @@ namespace {
   struct Dummy {
     int x, y, z;
   };
-}
+}  // namespace
 
 TEST_CASE("Test CopyToDevice for Alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
           "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
index 3afbc3d9d8103..a9034ead09e0f 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -176,6 +176,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
         output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()};
       }
     }
+
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                  portabletest::TestDeviceCollection::ConstView input,
+                                  TestAlgo::UpdateInfo const* updateInfo,
+                                  portabletest::TestDeviceCollection::View output) const {
+      // the scalar "r" is propagated only once in the whole kernel grid
+      if (once_per_grid(acc)) {
+        output.r() = input.r();
+      }
+
+      // strided loop over the kernel grid, covering up to "size" elements
+      for (int32_t idx : uniform_elements(acc, output.metadata().size())) {
+        // x is shifted by the offset stored in updateInfo; the other columns are copied unchanged
+        double const shifted = static_cast<double>(input[idx].x()) + updateInfo->x;
+        output[idx] = {shifted, input[idx].y(), input[idx].z(), input[idx].id(), input[idx].flags(), input[idx].m()};
+      }
+    }
   };
 
   class TestAlgoKernelUpdateMulti2 {
@@ -209,6 +227,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
         output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()};
       }
     }
+
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                  portabletest::TestSoA::ConstView input,
+                                  portabletest::TestSoA2::ConstView input2,
+                                  TestAlgo::UpdateInfo const* updateInfo,
+                                  portabletest::TestSoA::View output,
+                                  portabletest::TestSoA2::View output2) const {
+      // the scalars "r" and "r2" are propagated only once in the whole kernel grid
+      if (once_per_grid(acc)) {
+        output.r() = input.r();
+        output2.r2() = input2.r2();
+      }
+
+      // one strided loop over the kernel grid per collection, each covering up to its own "size"
+      for (int32_t idx : uniform_elements(acc, output.metadata().size())) {
+        // x is shifted by the offset stored in updateInfo; the other columns are copied unchanged
+        double const shifted = static_cast<double>(input[idx].x()) + updateInfo->x;
+        output[idx] = {shifted, input[idx].y(), input[idx].z(), input[idx].id(), input[idx].flags(), input[idx].m()};
+      }
+      for (int32_t idx : uniform_elements(acc, output2.metadata().size())) {
+        // the same offset is applied to x2
+        double const shifted2 = static_cast<double>(input2[idx].x2()) + updateInfo->x;
+        output2[idx] = {shifted2, input2[idx].y2(), input2[idx].z2(), input2[idx].id2(), input2[idx].m2()};
+      }
+    }
   };
 
   class TestAlgoKernelUpdateMulti3 {
@@ -254,6 +298,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
         output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()};
       }
     }
+
+    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
+    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                  portabletest::TestSoA::ConstView input,
+                                  portabletest::TestSoA2::ConstView input2,
+                                  portabletest::TestSoA3::ConstView input3,
+                                  TestAlgo::UpdateInfo const* updateInfo,
+                                  portabletest::TestSoA::View output,
+                                  portabletest::TestSoA2::View output2,
+                                  portabletest::TestSoA3::View output3) const {
+      // the scalars "r", "r2" and "r3" are propagated only once in the whole kernel grid
+      if (once_per_grid(acc)) {
+        output.r() = input.r();
+        output2.r2() = input2.r2();
+        output3.r3() = input3.r3();
+      }
+
+      // one strided loop over the kernel grid per collection, each covering up to its own "size"
+      for (int32_t idx : uniform_elements(acc, output.metadata().size())) {
+        // x is shifted by the offset stored in updateInfo; the other columns are copied unchanged
+        double const shifted = static_cast<double>(input[idx].x()) + updateInfo->x;
+        if (idx == 0)
+          printf("Setting x[0] to %f\n", shifted);
+        output[idx] = {shifted, input[idx].y(), input[idx].z(), input[idx].id(), input[idx].flags(), input[idx].m()};
+      }
+      for (int32_t idx : uniform_elements(acc, output2.metadata().size())) {
+        // the same offset is applied to x2
+        double const shifted2 = static_cast<double>(input2[idx].x2()) + updateInfo->x;
+        output2[idx] = {shifted2, input2[idx].y2(), input2[idx].z2(), input2[idx].id2(), input2[idx].m2()};
+      }
+      for (int32_t idx : uniform_elements(acc, output3.metadata().size())) {
+        // and to x3
+        double const shifted3 = static_cast<double>(input3[idx].x3()) + updateInfo->x;
+        output3[idx] = {shifted3, input3[idx].y3(), input3[idx].z3(), input3[idx].id3(), input3[idx].m3()};
+      }
+    }
   };
 
   portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
@@ -337,6 +417,87 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     return collection;
   }
 
+  portabletest::TestDeviceCollection TestAlgo::update(Queue& queue,
+                                                      portabletest::TestDeviceCollection const& input,
+                                                      UpdateInfo const* d_updateInfo) const {
+    portabletest::TestDeviceCollection result{input->metadata().size(), queue};
+
+    // 64 items per group: an arbitrary value, but a reasonable starting point
+    uint32_t const itemsPerGroup = 64;
+
+    // enough groups to cover the whole output collection
+    uint32_t const numGroups = divide_up_by(result->metadata().size(), itemsPerGroup);
+
+    // items are mapped to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto const workDivision = make_workdiv<Acc1D>(numGroups, itemsPerGroup);
+
+    alpaka::exec<Acc1D>(queue, workDivision, TestAlgoKernelUpdate{}, input.view(), d_updateInfo, result.view());
+
+    return result;
+  }
+
+  portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue,
+                                                                  portabletest::TestDeviceMultiCollection2 const& input,
+                                                                  UpdateInfo const* d_updateInfo) const {
+    portabletest::TestDeviceMultiCollection2 result{input.sizes(), queue};
+
+    // 64 items per group: an arbitrary value, but a reasonable starting point
+    uint32_t const itemsPerGroup = 64;
+
+    // enough groups to cover the largest of the collections
+    auto const sizes = result.sizes();
+    uint32_t const numGroups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), itemsPerGroup);
+
+    // items are mapped to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto const workDivision = make_workdiv<Acc1D>(numGroups, itemsPerGroup);
+
+    alpaka::exec<Acc1D>(queue,
+                        workDivision,
+                        TestAlgoKernelUpdateMulti2{},
+                        input.view<portabletest::TestSoA>(),
+                        input.view<portabletest::TestSoA2>(),
+                        d_updateInfo,
+                        result.view<portabletest::TestSoA>(),
+                        result.view<portabletest::TestSoA2>());
+
+    return result;
+  }
+
+  portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue,
+                                                                  portabletest::TestDeviceMultiCollection3 const& input,
+                                                                  UpdateInfo const* d_updateInfo) const {
+    portabletest::TestDeviceMultiCollection3 result{input.sizes(), queue};
+
+    // 64 items per group: an arbitrary value, but a reasonable starting point
+    uint32_t const itemsPerGroup = 64;
+
+    // enough groups to cover the largest of the collections
+    auto const sizes = result.sizes();
+    uint32_t const numGroups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), itemsPerGroup);
+
+    // items are mapped to
+    //   - threads with a single element per thread on a GPU backend
+    //   - elements within a single thread on a CPU backend
+    auto const workDivision = make_workdiv<Acc1D>(numGroups, itemsPerGroup);
+
+    alpaka::exec<Acc1D>(queue,
+                        workDivision,
+                        TestAlgoKernelUpdateMulti3{},
+                        input.view<portabletest::TestSoA>(),
+                        input.view<portabletest::TestSoA2>(),
+                        input.view<portabletest::TestSoA3>(),
+                        d_updateInfo,
+                        result.view<portabletest::TestSoA>(),
+                        result.view<portabletest::TestSoA2>(),
+                        result.view<portabletest::TestSoA3>());
+
+    return result;
+  }
+
   class TestZeroCollectionKernel {
   public:
     template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h
index f943eacddd1c3..dbebf60e898b5 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h
@@ -17,6 +17,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     portabletest::TestDeviceCollection update(Queue& queue,
                                               portabletest::TestDeviceCollection const& input,
                                               AlpakaESTestDataEDevice const& esData) const;
+
     portabletest::TestDeviceMultiCollection2 updateMulti2(Queue& queue,
                                                           portabletest::TestDeviceMultiCollection2 const& input,
                                                           AlpakaESTestDataEDevice const& esData) const;
@@ -24,6 +25,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
                                                           portabletest::TestDeviceMultiCollection3 const& input,
                                                           AlpakaESTestDataEDevice const& esData) const;
 
+    struct UpdateInfo {
+      int x, y, z;
+    };
+    portabletest::TestDeviceCollection update(Queue& queue,
+                                              portabletest::TestDeviceCollection const& input,
+                                              UpdateInfo const* d_updateInfo) const;
+    portabletest::TestDeviceMultiCollection2 updateMulti2(Queue& queue,
+                                                          portabletest::TestDeviceMultiCollection2 const& input,
+                                                          UpdateInfo const* d_updateInfo) const;
+    portabletest::TestDeviceMultiCollection3 updateMulti3(Queue& queue,
+                                                          portabletest::TestDeviceMultiCollection3 const& input,
+                                                          UpdateInfo const* d_updateInfo) const;
+
     void fillMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2& collection, double xvalue = 0.) const;
     void fillMulti3(Queue& queue, portabletest::TestDeviceMultiCollection3& collection, double xvalue = 0.) const;
 
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc
new file mode 100644
index 0000000000000..331b85d093bda
--- /dev/null
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc
@@ -0,0 +1,83 @@
+#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+#include "TestAlgo.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  /**
+   * This class demonstrates a global EDProducer that
+   * - uses a CopyToDeviceCache to copy some host-side data to the devices of the backend.
+   * - produces a device EDProduct (that can get transferred to host automatically)
+   */
+  class TestAlpakaGlobalProducerCopyToDeviceCache : public global::EDProducer<> {
+  public:
+    TestAlpakaGlobalProducerCopyToDeviceCache(edm::ParameterSet const& config)
+        : getToken_(consumes(config.getParameter<edm::InputTag>("source"))),
+          getTokenMulti2_(consumes(config.getParameter<edm::InputTag>("source"))),
+          getTokenMulti3_(consumes(config.getParameter<edm::InputTag>("source"))),
+          putToken_{produces()},
+          putTokenMulti2_{produces()},
+          putTokenMulti3_{produces()},
+          // create host-side object that gets implicitly copied to all devices of the backend
+          deviceCache_{[&config]() {
+            auto buffer = cms::alpakatools::make_host_buffer<TestAlgo::UpdateInfo>();
+            *buffer = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"),
+                                           config.getParameter<int32_t>("y"),
+                                           config.getParameter<int32_t>("z")};
+            return buffer;
+          }()} {}
+
+    void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override {
+      auto const& input = iEvent.get(getToken_);
+      auto const& inputMulti2 = iEvent.get(getTokenMulti2_);
+      auto const& inputMulti3 = iEvent.get(getTokenMulti3_);
+
+      // get the object corresponding to the Device the Event is being processed on
+      auto const& infoBuffer = deviceCache_.get(iEvent.queue());
+
+      // run the algorithm, potentially asynchronously
+      auto deviceProduct = algo_.update(iEvent.queue(), input, infoBuffer.data());
+      auto deviceProductMulti2 = algo_.updateMulti2(iEvent.queue(), inputMulti2, infoBuffer.data());
+      auto deviceProductMulti3 = algo_.updateMulti3(iEvent.queue(), inputMulti3, infoBuffer.data());
+
+      iEvent.emplace(putToken_, std::move(deviceProduct));
+      iEvent.emplace(putTokenMulti2_, std::move(deviceProductMulti2));
+      iEvent.emplace(putTokenMulti3_, std::move(deviceProductMulti3));
+    }
+
+    static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+      edm::ParameterSetDescription desc;
+
+      desc.add("source", edm::InputTag{});
+      desc.add<int32_t>("x", 0);
+      desc.add<int32_t>("y", 1);
+      desc.add<int32_t>("z", 2);
+
+      descriptions.addWithDefaultLabel(desc);
+    }
+
+  private:
+    const device::EDGetToken<portabletest::TestDeviceCollection> getToken_;
+    const device::EDGetToken<portabletest::TestDeviceMultiCollection2> getTokenMulti2_;
+    const device::EDGetToken<portabletest::TestDeviceMultiCollection3> getTokenMulti3_;
+    const device::EDPutToken<portabletest::TestDeviceCollection> putToken_;
+    const device::EDPutToken<portabletest::TestDeviceMultiCollection2> putTokenMulti2_;
+    const device::EDPutToken<portabletest::TestDeviceMultiCollection3> putTokenMulti3_;
+
+    // implementation of the algorithm
+    TestAlgo algo_;
+
+    cms::alpakatools::CopyToDeviceCache<Queue, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_;
+  };
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h"
+DEFINE_FWK_ALPAKA_MODULE(TestAlpakaGlobalProducerCopyToDeviceCache);
diff --git a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
index 62279b26b3010..bdacd11d7e4c2 100644
--- a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
+++ b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
@@ -78,6 +78,12 @@
 process.alpakaGlobalProducerE = cms.EDProducer("TestAlpakaGlobalProducerE@alpaka",
     source = cms.InputTag("alpakaGlobalProducer")
 )
+process.alpakaGlobalProducerCopyToDeviceCache = cms.EDProducer("TestAlpakaGlobalProducerCopyToDeviceCache@alpaka",
+    source = cms.InputTag("alpakaGlobalProducer"),
+    x = cms.int32(3),
+    y = cms.int32(4),
+    z = cms.int32(5),
+)
 process.alpakaStreamProducer = cms.EDProducer("TestAlpakaStreamProducer@alpaka",
     source = cms.InputTag("intProduct"),
     eventSetupSource = cms.ESInputTag("alpakaESProducerB", "explicitLabel"),
@@ -122,6 +128,10 @@
     source = "alpakaGlobalProducerE",
     expectXvalues = cms.vdouble([(i%2)*10+1 + abs(27)+i*2 for i in range(0,5)] + [0]*5)
 )
+process.alpakaGlobalConsumerCopyToDeviceCache = process.alpakaGlobalConsumer.clone(
+    source = "alpakaGlobalProducerCopyToDeviceCache",
+    expectXvalues = cms.vdouble([3]*10)
+)
 process.alpakaStreamConsumer = cms.EDAnalyzer("TestAlpakaAnalyzer",
     source = cms.InputTag("alpakaStreamProducer"),
     expectSize = cms.int32(5),
@@ -153,7 +163,7 @@
 if args.moduleBackend != "":
     for name in ["ESProducerA", "ESProducerB", "ESProducerC", "ESProducerD", "ESProducerE", "ESProducerAMulti",
                  "ESProducerNull",
-                 "GlobalProducer", "GlobalProducerE",
+                 "GlobalProducer", "GlobalProducerE", "GlobalProducerCopyToDeviceCache",
                  "StreamProducer", "StreamInstanceProducer",
                  "StreamSynchronizingProducer", "StreamSynchronizingProducerToDevice",
                  "GlobalDeviceConsumer", "StreamDeviceConsumer",
@@ -168,6 +178,8 @@ def setExpect(m, size):
     setExpect(process.alpakaGlobalConsumer, size=20)
     setExpect(process.alpakaGlobalConsumerE, size=20)
     process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(20-10))
+    setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size=20)
+    process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*20
     setExpect(process.alpakaStreamConsumer, size=25)
     setExpect(process.alpakaStreamInstanceConsumer, size=36)
     setExpect(process.alpakaStreamSynchronizingConsumer, size=20)
@@ -178,6 +190,8 @@ def setExpect(m, size):
     setExpect(process.alpakaGlobalConsumer, size = 30)
     setExpect(process.alpakaGlobalConsumerE, size = 30)
     process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(30-10))
+    setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size = 30)
+    process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*30
     setExpect(process.alpakaStreamConsumer, size = 125)
     setExpect(process.alpakaStreamInstanceConsumer, size = 216)
     setExpect(process.alpakaStreamSynchronizingConsumer, size = 30)
@@ -196,6 +210,7 @@ def setExpect(m, size):
     process.intProduct,
     process.alpakaGlobalProducer,
     process.alpakaGlobalProducerE,
+    process.alpakaGlobalProducerCopyToDeviceCache,
     process.alpakaStreamProducer,
     process.alpakaStreamInstanceProducer,
     process.alpakaStreamSynchronizingProducer,
@@ -205,6 +220,7 @@ def setExpect(m, size):
     process.alpakaGlobalConsumer+
     process.alpakaGlobalDeviceConsumer+
     process.alpakaGlobalConsumerE+
+    process.alpakaGlobalConsumerCopyToDeviceCache+
     process.alpakaStreamConsumer+
     process.alpakaStreamDeviceConsumer+
     process.alpakaStreamInstanceConsumer+

From d70f43b9de634b457ffee1df43a90f01eba6bcb1 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Wed, 10 Jan 2024 23:12:02 +0100
Subject: [PATCH 3/9] Add moveToDeviceAsync function

---
 HeterogeneousCore/AlpakaInterface/README.md   |  38 +++++-
 .../interface/moveToDeviceAsync.h             |  47 +++++++
 .../AlpakaInterface/test/BuildFile.xml        |   7 +
 .../test/alpaka/testMoveToDeviceAsync.dev.cc  | 121 ++++++++++++++++++
 4 files changed, 212 insertions(+), 1 deletion(-)
 create mode 100644 HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc

diff --git a/HeterogeneousCore/AlpakaInterface/README.md b/HeterogeneousCore/AlpakaInterface/README.md
index 3d90abdf5a2b5..e6fb83eb41d19 100644
--- a/HeterogeneousCore/AlpakaInterface/README.md
+++ b/HeterogeneousCore/AlpakaInterface/README.md
@@ -137,7 +137,9 @@ See the previous section for considerations about the use of device-mapped
 memory.
 
 
-## A note about copies and synchronisation
+## Notes about copies and synchronisation
+
+### Host-to-device copy
 
 When copying data from a host buffer to a device buffer, _e.g._ with
 ```c++
@@ -163,6 +165,40 @@ std::memset(a_host_buffer.data(), 0x00, size);
 is likely to overwrite part of the buffer while the copy is still ongoing,
 resulting in `a_device_buffer` with incomplete and corrupted contents.
 
+### Host-to-device move
+
+For host data types that are movable but not copyable one can, to a
+large degree, stop worrying about the caveats above (avoiding any
+operations on the host data during the copy) by using the following utility and move semantics:
+```c++
+#include "HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h"
+// ...
+auto device_object = cms::alpakatools::moveToDeviceAsync(queue, std::move(host_object));
+```
+
+Here the host-side `host_object` is _moved_ to the
+`moveToDeviceAsync()` function, which returns a corresponding
+device-side `device_object`. In this case any subsequent use of
+`host_object` is clearly "use after move", which is easier to catch in
+code review or by static analysis tools than the consequences of
+`alpaka::memcpy()`.
+
+The `cms::alpakatools::CopyToDevice<T>` class template must have a
+specialization for the host data type (otherwise the compilation will fail).
+
+As mentioned above, the host data type must be movable but not
+copyable (the compilation will fail with copyable types). For example,
+the `PortableHostCollection` and `PortableHostObject` class templates
+can be used, but Alpaka buffers can not be directly used.
+
+The host data object should manage memory in a
+[queue-ordered](#allocating-queue-ordered-host-buffers-in-device-mapped-memory)
+way. If not, the object must synchronize the device and the host in
+its destructor (although such synchronization is undesirable).
+Otherwise, the behavior is undefined.
+
+### Device-to-host copy
+
 When copying data from a device buffer to a host buffer, _e.g._ with
 ```c++
 alpaka::memcpy(queue, a_host_buffer, a_device_buffer);
diff --git a/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h b/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h
new file mode 100644
index 0000000000000..25aab77685163
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h
@@ -0,0 +1,47 @@
+#ifndef HeterogeneousCore_AlpakaInterface_interface_moveToDeviceAsync_h
+#define HeterogeneousCore_AlpakaInterface_interface_moveToDeviceAsync_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h"
+
+namespace cms::alpakatools {
+  /**
+   * This function moves the argument hostObject to the device
+   * specified by the queue. Here the "move" means that the argument
+   * host object must not be used in the caller after this function
+   * has been called.
+   *
+   * The CopyToDevice class template is used to define the returned
+   * device object that corresponds to the argument host object. For host
+   * device the copying is skipped, and hostObject is returned directly.
+   *
+   * The host object must either
+   * - allocate its memory in queue-ordered way (e.g. using make_host_buffer(TQueue, ...)), or
+   * - synchronize in its destructor (makes this function synchronous, so not preferred)
+   * If the host object uses non-queue-order-allocated memory, and
+   * does not synchronize in its destructor, behavior is undefined.
+   *
+   * Note that the host object type is required to be non-copyable.
+   * This is to avoid easy mistakes with objects that follow copy
+   * semantics of std::shared_ptr (that includes Alpaka buffers), that
+   * would allow the source memory buffer to be used via another copy
+   * during the asynchronous data copy to the device.
+   */
+  template <typename TQueue, typename THostObject, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
+  auto moveToDeviceAsync(TQueue& queue, THostObject hostObject) {
+    static_assert(not(std::is_copy_constructible_v<THostObject> or std::is_copy_assignable_v<THostObject>),
+                  "The data object to be moved to device must not be copyable.");
+
+    if constexpr (std::is_same_v<alpaka::Dev<TQueue>, alpaka_common::DevHost>) {
+      return hostObject;  // host backend: no copy, the object itself is handed back
+    } else {
+      return CopyToDevice<THostObject>::copyAsync(queue, hostObject);
+    }
+  }
+}  // namespace cms::alpakatools
+
+#endif
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index 77aa10cc5a171..4c0d1ffff0b27 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -26,6 +26,13 @@
   <flags ALPAKA_BACKENDS="1"/>
 </bin>
 
+<bin name="alpakaMoveToDeviceAsync" file="alpaka/testMoveToDeviceAsync.dev.cc">
+  <use name="alpaka"/>
+  <use name="catch2"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</bin>
+
 <bin name="alpakaTestBackend" file="testBackend.cc">
   <use name="catch2"/>
   <use name="HeterogeneousCore/AlpakaInterface"/>
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc
new file mode 100644
index 0000000000000..1e3b7f80b318f
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc
@@ -0,0 +1,121 @@
+#include <optional>
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+// each test binary is built for a single Alpaka backend
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+namespace {
+  template <typename T>
+  class TestHostBuffer {  // move-only owner of a host buffer of T[]; a moved-from object holds no buffer
+  public:
+    using Buffer = cms::alpakatools::host_buffer<T[]>;
+    using ConstBuffer = cms::alpakatools::const_host_buffer<T[]>;
+
+    template <typename TQueue>
+    TestHostBuffer(TQueue const& queue, int size) : buffer_(cms::alpakatools::make_host_buffer<T[]>(queue, size)) {}
+
+    TestHostBuffer(TestHostBuffer const&) = delete;
+    TestHostBuffer& operator=(TestHostBuffer const&) = delete;
+    // Moves transfer the whole optional (safe even if `other` is already empty) and then empty the source
+    TestHostBuffer(TestHostBuffer&& other) {
+      buffer_ = std::move(other.buffer_);
+      other.buffer_.reset();
+    }
+    TestHostBuffer& operator=(TestHostBuffer&& other) {
+      buffer_ = std::move(other.buffer_);
+      other.buffer_.reset();
+      return *this;
+    }
+
+    bool has_value() const { return buffer_.has_value(); }  // false once the object has been moved from
+
+    T* data() { return buffer_->data(); }  // requires has_value()
+
+    Buffer buffer() { return *buffer_; }  // requires has_value()
+    ConstBuffer buffer() const { return *buffer_; }
+
+  private:
+    std::optional<Buffer> buffer_;
+  };
+
+  template <typename T, typename TDev>
+  class TestDeviceBuffer {  // device-side counterpart of TestHostBuffer<T>, wrapping a device buffer of T[]
+  public:
+    using Buffer = cms::alpakatools::device_buffer<TDev, T[]>;
+
+    template <typename TQueue>
+    TestDeviceBuffer(TQueue const& queue, int size) : buffer_(cms::alpakatools::make_device_buffer<T[]>(queue, size)) {}
+
+    T* data() { return buffer_.data(); }  // raw pointer obtained from the wrapped buffer
+
+    Buffer buffer() { return buffer_; }  // returns the buffer object by value
+
+  private:
+    Buffer buffer_;
+  };
+
+  template <typename T>
+  void fillBuffer(TestHostBuffer<T>& buffer) {  // writes the value i into element i, for every element
+    for (int i = 0, size = alpaka::getExtentProduct(buffer.buffer()); i < size; ++i) {
+      buffer.data()[i] = i;
+    }
+  }
+}  // namespace
+
+namespace cms::alpakatools {
+  template <typename T>
+  struct CopyToDevice<TestHostBuffer<T>> {  // specialization that moveToDeviceAsync() uses for TestHostBuffer<T>
+    template <typename TQueue>
+    static auto copyAsync(TQueue& queue, TestHostBuffer<T> const& hostBuffer) {
+      TestDeviceBuffer<T, alpaka::Dev<TQueue>> deviceBuffer(queue, alpaka::getExtentProduct(hostBuffer.buffer()));
+      alpaka::memcpy(queue, deviceBuffer.buffer(), hostBuffer.buffer());
+      return deviceBuffer;  // sized from the extent of hostBuffer
+    }
+  };
+}  // namespace cms::alpakatools
+
+TEST_CASE("Test moveToDeviceAsync() for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    auto queue = Queue(device);
+    constexpr int size = 32;
+    TestHostBuffer<int> buffer_host(queue, size);
+    fillBuffer(buffer_host);
+    auto const* ptr_host = buffer_host.data();  // kept to compare against the returned object's data()
+
+    auto buffer_device = cms::alpakatools::moveToDeviceAsync(queue, std::move(buffer_host));
+    REQUIRE(not buffer_host.has_value());  // the source object must have been emptied by the move
+    if constexpr (std::is_same_v<Device, alpaka_common::DevHost>) {
+      REQUIRE(buffer_device.data() == ptr_host);  // host backend: the same memory is returned
+    } else {
+      REQUIRE(buffer_device.data() != ptr_host);  // other backends: the data is in a different buffer
+    }
+    alpaka::exec<Acc1D>(
+        queue,
+        cms::alpakatools::make_workdiv<Acc1D>(1, size),
+        [] ALPAKA_FN_ACC(Acc1D const& acc, int const* data) {
+          for (int i : cms::alpakatools::uniform_elements(acc)) {
+            assert(data[i] == i);  // values written by fillBuffer()
+          }
+        },
+        buffer_device.data());
+    alpaka::wait(queue);
+
+    /* the following should not compile: Alpaka buffers are copyable, which moveToDeviceAsync() rejects
+    auto buffer2_host = cms::alpakatools::make_host_buffer<int>();
+    auto buffer2_device = cms::alpakatools::moveToDeviceAsync(queue, std::move(buffer2_host));
+    */
+  }
+}

From 60da57efd157612e118b46a5190ce9caed149290 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 12 Jan 2024 23:15:49 +0100
Subject: [PATCH 4/9] Add MoveToDeviceCache class template

---
 .../AlpakaCore/interface/MoveToDeviceCache.h  | 108 ++++++++++++++++++
 .../AlpakaTest/plugins/BuildFile.xml          |   1 +
 ...stAlpakaGlobalProducerMoveToDeviceCache.cc |  84 ++++++++++++++
 .../AlpakaTest/test/testAlpakaModules_cfg.py  |  19 ++-
 4 files changed, 211 insertions(+), 1 deletion(-)
 create mode 100644 HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
 create mode 100644 HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc

diff --git a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
new file mode 100644
index 0000000000000..2f5bcbb765bac
--- /dev/null
+++ b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
@@ -0,0 +1,108 @@
+#ifndef HeterogeneousCore_AlpakaCore_interface_MoveToDeviceCache_h
+#define HeterogeneousCore_AlpakaCore_interface_MoveToDeviceCache_h
+
+#include <type_traits>
+
+#include <alpaka/alpaka.hpp>
+
+#include "HeterogeneousCore/AlpakaCore/interface/QueueCache.h"
+#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/devices.h"
+
+namespace cms::alpakatools {
+  namespace detail {
+    // By default copy the host object with CopyToDevice<T>
+    //
+    // Done with template specialization (rather than
+    // std::conditional_t and if constexpr) because the
+    // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. for
+    // PortableCollection on host device
+    template <typename TDev, typename TQueue, typename THostObject>
+    class MoveToDeviceCacheImpl {
+    public:
+      using HostObject = THostObject;
+      using Impl = CopyToDeviceCacheImpl<TDev, TQueue, THostObject>;
+      using DeviceObject = typename Impl::DeviceObject;
+
+      MoveToDeviceCacheImpl(HostObject&& srcObject) : impl_(srcObject) {}  // passed on as an lvalue, not moved
+
+      DeviceObject const& get(size_t i) const { return impl_.get(i); }  // forwards the index to the wrapped cache
+
+    private:
+      Impl impl_;
+    };
+
+    // For host device, move the host object into this cache instead of copying it
+    template <typename TQueue, typename THostObject>
+    class MoveToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> {
+    public:
+      using HostObject = THostObject;
+      using DeviceObject = HostObject;  // on the host the "device" object is the host object itself
+
+      MoveToDeviceCacheImpl(HostObject&& srcObject) : data_(std::move(srcObject)) {}
+
+      DeviceObject const& get(size_t i) const { return data_; }  // the index i is ignored
+
+    private:
+      HostObject data_;
+    };
+  }  // namespace detail
+
+  /**
+   * This class template implements a cache for data that is moved
+   * from the host (of type THostObject) to all the devices
+   * corresponding to the TQueue queue type.
+   *
+   * The host-side object to be moved is given as an argument to the
+   * class constructor. The constructor uses the
+   * CopyToDevice<THostObject> class template to copy the data to the
+   * devices, and waits for the data copies to finish, i.e. the
+   * constructor is synchronous wrt. the data copies. The "move" is
+   * achieved by requiring the constructor argument to be an rvalue
+   * reference.
+   *
+   * Note that the host object type is required to be non-copyable.
+   * This is to avoid easy mistakes with objects that follow copy
+   * semantics of std::shared_ptr (that includes Alpaka buffers), that
+   * would allow the source memory buffer to be used via another copy
+   * during the asynchronous data copy to the device.
+   *
+   * The device-side object corresponding to the THostObject (actual
+   * type is the return type of CopyToDevice<THostObject>::copyAsync())
+   * can be obtained with the get() member function, which takes either
+   * a queue or a device argument.
+   *
+   * TODO: In principle it would be better to template over Device,
+   * but then we'd need a way to have a "default queue" type for each
+   * Device in order to infer the return type of
+   * CopyToDevice::copyAsync(). Alternatively, the template over
+   * TQueue could be removed by moving the class definition to
+   * ALPAKA_ACCELERATOR_NAMESPACE.
+   */
+  template <typename TQueue, typename THostObject>
+  class MoveToDeviceCache {
+  public:
+    using Queue = TQueue;
+    using Device = alpaka::Dev<Queue>;
+    using HostObject = THostObject;
+    using Impl = detail::MoveToDeviceCacheImpl<Device, Queue, HostObject>;
+    using DeviceObject = typename Impl::DeviceObject;
+
+    static_assert(not(std::is_copy_constructible_v<HostObject> or std::is_copy_assignable_v<HostObject>),
+                  "The data object to be moved to device must not be copyable.");
+
+    MoveToDeviceCache(HostObject&& srcData) : data_(std::move(srcData)) {}
+
+    // TODO: I could make this function to return the contained object
+    // in case of alpaka buffer, PortableObject, or PortableCollection
+    // (in PortableCollection case it would be the View)
+    DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); }
+
+    DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); }
+
+  private:
+    Impl data_;
+  };
+}  // namespace cms::alpakatools
+
+#endif
diff --git a/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml b/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml
index a4058755409f7..53652048838f7 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml
@@ -19,6 +19,7 @@
   The dependency on "DataFormats/PortableTestObjects" automatically expands to include
   the host-only library (if it exists) and the corresponding Alpaka libraries (if they exist)
   -->
+  <use name="DataFormats/Portable"/>
   <use name="DataFormats/PortableTestObjects"/>
   <use name="DataFormats/TestObjects"/>
   <use name="FWCore/Framework"/>
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
new file mode 100644
index 0000000000000..4ca7888002872
--- /dev/null
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
@@ -0,0 +1,84 @@
+#include "DataFormats/Portable/interface/PortableObject.h"
+#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/Utilities/interface/InputTag.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h"
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h"
+#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+
+#include "TestAlgo.h"
+
+namespace ALPAKA_ACCELERATOR_NAMESPACE {
+  /**
+   * This class demonstrates a global EDProducer that
+   * - uses a MoveToDeviceCache to move some host-side data (a TestAlgo::UpdateInfo) to the devices of the backend.
+   * - produces a device EDProduct (that can get transferred to host automatically)
+   */
+  class TestAlpakaGlobalProducerMoveToDeviceCache : public global::EDProducer<> {
+  public:
+    TestAlpakaGlobalProducerMoveToDeviceCache(edm::ParameterSet const& config)
+        : getToken_(consumes(config.getParameter<edm::InputTag>("source"))),
+          getTokenMulti2_(consumes(config.getParameter<edm::InputTag>("source"))),
+          getTokenMulti3_(consumes(config.getParameter<edm::InputTag>("source"))),
+          putToken_{produces()},
+          putTokenMulti2_{produces()},
+          putTokenMulti3_{produces()},
+          // create host-side object that gets implicitly copied to all devices of the backend
+          deviceCache_{[&config]() {
+            PortableHostObject<TestAlgo::UpdateInfo> obj(cms::alpakatools::host());
+            *obj = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"),
+                                        config.getParameter<int32_t>("y"),
+                                        config.getParameter<int32_t>("z")};
+            return obj;
+          }()} {}
+
+    void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override {
+      auto const& input = iEvent.get(getToken_);
+      auto const& inputMulti2 = iEvent.get(getTokenMulti2_);
+      auto const& inputMulti3 = iEvent.get(getTokenMulti3_);
+
+      // get the object corresponding to the Device the Event is being processed on
+      auto const& infoObj = deviceCache_.get(iEvent.queue());
+
+      // run the algorithm, potentially asynchronously
+      auto deviceProduct = algo_.update(iEvent.queue(), input, infoObj.data());
+      auto deviceProductMulti2 = algo_.updateMulti2(iEvent.queue(), inputMulti2, infoObj.data());
+      auto deviceProductMulti3 = algo_.updateMulti3(iEvent.queue(), inputMulti3, infoObj.data());
+
+      iEvent.emplace(putToken_, std::move(deviceProduct));
+      iEvent.emplace(putTokenMulti2_, std::move(deviceProductMulti2));
+      iEvent.emplace(putTokenMulti3_, std::move(deviceProductMulti3));
+    }
+
+    static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+      edm::ParameterSetDescription desc;
+
+      desc.add("source", edm::InputTag{});
+      desc.add<int32_t>("x", 0);  // x, y and z initialise the TestAlgo::UpdateInfo held by deviceCache_
+      desc.add<int32_t>("y", 1);
+      desc.add<int32_t>("z", 2);
+
+      descriptions.addWithDefaultLabel(desc);
+    }
+
+  private:
+    const device::EDGetToken<portabletest::TestDeviceCollection> getToken_;
+    const device::EDGetToken<portabletest::TestDeviceMultiCollection2> getTokenMulti2_;
+    const device::EDGetToken<portabletest::TestDeviceMultiCollection3> getTokenMulti3_;
+    const device::EDPutToken<portabletest::TestDeviceCollection> putToken_;
+    const device::EDPutToken<portabletest::TestDeviceMultiCollection2> putTokenMulti2_;
+    const device::EDPutToken<portabletest::TestDeviceMultiCollection3> putTokenMulti3_;
+
+    // implementation of the algorithm
+    TestAlgo algo_;
+    // cache of the TestAlgo::UpdateInfo, looked up with the Event's queue in produce()
+    cms::alpakatools::MoveToDeviceCache<Queue, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_;
+  };
+
+}  // namespace ALPAKA_ACCELERATOR_NAMESPACE
+
+#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h"
+DEFINE_FWK_ALPAKA_MODULE(TestAlpakaGlobalProducerMoveToDeviceCache);
diff --git a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
index bdacd11d7e4c2..ac39117119cce 100644
--- a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
+++ b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py
@@ -84,6 +84,12 @@
     y = cms.int32(4),
     z = cms.int32(5),
 )
+process.alpakaGlobalProducerMoveToDeviceCache = cms.EDProducer("TestAlpakaGlobalProducerMoveToDeviceCache@alpaka",
+    source = cms.InputTag("alpakaGlobalProducer"),
+    x = cms.int32(32),
+    y = cms.int32(42),
+    z = cms.int32(52),
+)
 process.alpakaStreamProducer = cms.EDProducer("TestAlpakaStreamProducer@alpaka",
     source = cms.InputTag("intProduct"),
     eventSetupSource = cms.ESInputTag("alpakaESProducerB", "explicitLabel"),
@@ -132,6 +138,10 @@
     source = "alpakaGlobalProducerCopyToDeviceCache",
     expectXvalues = cms.vdouble([3]*10)
 )
+process.alpakaGlobalConsumerMoveToDeviceCache = process.alpakaGlobalConsumer.clone(
+    source = "alpakaGlobalProducerMoveToDeviceCache",
+    expectXvalues = cms.vdouble([32]*10)
+)
 process.alpakaStreamConsumer = cms.EDAnalyzer("TestAlpakaAnalyzer",
     source = cms.InputTag("alpakaStreamProducer"),
     expectSize = cms.int32(5),
@@ -163,7 +173,8 @@
 if args.moduleBackend != "":
     for name in ["ESProducerA", "ESProducerB", "ESProducerC", "ESProducerD", "ESProducerE", "ESProducerAMulti",
                  "ESProducerNull",
-                 "GlobalProducer", "GlobalProducerE", "GlobalProducerCopyToDeviceCache",
+                 "GlobalProducer", "GlobalProducerE",
+                 "GlobalProducerCopyToDeviceCache", "GlobalProducerMoveToDeviceCache",
                  "StreamProducer", "StreamInstanceProducer",
                  "StreamSynchronizingProducer", "StreamSynchronizingProducerToDevice",
                  "GlobalDeviceConsumer", "StreamDeviceConsumer",
@@ -180,6 +191,8 @@ def setExpect(m, size):
     process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(20-10))
     setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size=20)
     process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*20
+    setExpect(process.alpakaGlobalConsumerMoveToDeviceCache, size=20)
+    process.alpakaGlobalConsumerMoveToDeviceCache.expectXvalues = [32]*20
     setExpect(process.alpakaStreamConsumer, size=25)
     setExpect(process.alpakaStreamInstanceConsumer, size=36)
     setExpect(process.alpakaStreamSynchronizingConsumer, size=20)
@@ -192,6 +205,8 @@ def setExpect(m, size):
     process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(30-10))
     setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size = 30)
     process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*30
+    setExpect(process.alpakaGlobalConsumerMoveToDeviceCache, size = 30)
+    process.alpakaGlobalConsumerMoveToDeviceCache.expectXvalues = [32]*30
     setExpect(process.alpakaStreamConsumer, size = 125)
     setExpect(process.alpakaStreamInstanceConsumer, size = 216)
     setExpect(process.alpakaStreamSynchronizingConsumer, size = 30)
@@ -211,6 +226,7 @@ def setExpect(m, size):
     process.alpakaGlobalProducer,
     process.alpakaGlobalProducerE,
     process.alpakaGlobalProducerCopyToDeviceCache,
+    process.alpakaGlobalProducerMoveToDeviceCache,
     process.alpakaStreamProducer,
     process.alpakaStreamInstanceProducer,
     process.alpakaStreamSynchronizingProducer,
@@ -221,6 +237,7 @@ def setExpect(m, size):
     process.alpakaGlobalDeviceConsumer+
     process.alpakaGlobalConsumerE+
     process.alpakaGlobalConsumerCopyToDeviceCache+
+    process.alpakaGlobalConsumerMoveToDeviceCache+
     process.alpakaStreamConsumer+
     process.alpakaStreamDeviceConsumer+
     process.alpakaStreamInstanceConsumer+

From 61a5c3a11ca1e088f0ebb685ff59f88326d1ff1f Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 8 Feb 2024 23:13:49 +0100
Subject: [PATCH 5/9] Allow the contained object of PortableHostObject to be
 initialized in the constructor

---
 .../Portable/interface/PortableHostObject.h   | 22 +++++++++++
 .../test/test_catch2_portableObjectOnHost.cc  | 38 +++++++++++++++++--
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/DataFormats/Portable/interface/PortableHostObject.h b/DataFormats/Portable/interface/PortableHostObject.h
index b2f84b38be2dc..a2051a6ff2ab9 100644
--- a/DataFormats/Portable/interface/PortableHostObject.h
+++ b/DataFormats/Portable/interface/PortableHostObject.h
@@ -20,16 +20,30 @@ class PortableHostObject {
   using Buffer = cms::alpakatools::host_buffer<Product>;
   using ConstBuffer = cms::alpakatools::const_host_buffer<Product>;
 
+  static_assert(std::is_trivially_destructible_v<Product>);
+
   PortableHostObject() = delete;
 
   PortableHostObject(edm::Uninitialized) noexcept {}
 
+  // Note that in contrast to the variadic template overload, this
+  // constructor does not initialize the contained object
   PortableHostObject(alpaka_common::DevHost const& host)
       // allocate pageable host memory
       : buffer_{cms::alpakatools::make_host_buffer<Product>()}, product_{buffer_->data()} {
     assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0);
   }
 
+  template <typename... Args>
+  PortableHostObject(alpaka_common::DevHost const& host, Args&&... args)
+      // allocate pageable host memory
+      : buffer_{cms::alpakatools::make_host_buffer<Product>()},
+        product_{new(buffer_->data()) Product(std::forward<Args>(args)...)} {
+    assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0);
+  }
+
+  // Note that in contrast to the variadic template overload, this
+  // constructor does not initialize the contained object
   template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
   PortableHostObject(TQueue const& queue)
       // allocate pinned host memory associated to the given work queue, accessible by the queue's device
@@ -37,6 +51,14 @@ class PortableHostObject {
     assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0);
   }
 
+  template <typename TQueue, typename... Args, typename = std::enable_if_t<alpaka::isQueue<TQueue>>>
+  PortableHostObject(TQueue const& queue, Args&&... args)
+      // allocate pinned host memory associated to the given work queue, accessible by the queue's device
+      : buffer_{cms::alpakatools::make_host_buffer<Product>(queue)},
+        product_{new(buffer_->data()) Product(std::forward<Args>(args)...)} {
+    assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0);
+  }
+
   // non-copyable
   PortableHostObject(PortableHostObject const&) = delete;
   PortableHostObject& operator=(PortableHostObject const&) = delete;
diff --git a/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc b/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc
index 698605b57f465..4afe56be3e322 100644
--- a/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc
+++ b/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc
@@ -14,10 +14,40 @@ namespace {
 
 // This test is currently mostly about the code compiling
 TEST_CASE("Use of PortableObject<T> on host code", s_tag) {
-  PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host());
-  obj->a = 42;
+  static_assert(std::is_same_v<PortableObject<Test, alpaka::DevCpu>, PortableHostObject<Test>>);
 
-  SECTION("Tests") { REQUIRE(obj->a == 42); }
+  SECTION("Initialize by setting members") {
+    SECTION("With device") {
+      PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host());
+      obj->a = 42;
 
-  static_assert(std::is_same_v<PortableObject<Test, alpaka::DevCpu>, PortableHostObject<Test>>);
+      REQUIRE(obj->a == 42);
+    }
+
+    SECTION("With queue") {
+      alpaka::QueueCpuBlocking queue(cms::alpakatools::host());
+
+      PortableObject<Test, alpaka::DevCpu> obj(queue);
+      obj->a = 42;
+
+      REQUIRE(obj->a == 42);
+    }
+  }
+
+  SECTION("Initialize via constructor") {
+    SECTION("With device") {
+      PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host(), Test{42, 3.14f});
+
+      REQUIRE(obj->a == 42);
+      REQUIRE(obj->b == 3.14f);
+    }
+
+    SECTION("With queue") {
+      alpaka::QueueCpuBlocking queue(cms::alpakatools::host());
+      PortableObject<Test, alpaka::DevCpu> obj(queue, Test{42, 3.14f});
+
+      REQUIRE(obj->a == 42);
+      REQUIRE(obj->b == 3.14f);
+    }
+  }
 }

From 3d6f3204b8d455cce075a174845700f46ee2b29a Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Thu, 8 Feb 2024 23:20:42 +0100
Subject: [PATCH 6/9] Replace lambda with a direct initialization of
 PortableHostObject

---
 .../TestAlpakaGlobalProducerMoveToDeviceCache.cc     | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
index 4ca7888002872..51d756e5dbc8f 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
@@ -27,13 +27,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
           putTokenMulti2_{produces()},
           putTokenMulti3_{produces()},
           // create host-side object that gets implicitly copied to all devices of the backend
-          deviceCache_{[&config]() {
-            PortableHostObject<TestAlgo::UpdateInfo> obj(cms::alpakatools::host());
-            *obj = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"),
-                                        config.getParameter<int32_t>("y"),
-                                        config.getParameter<int32_t>("z")};
-            return obj;
-          }()} {}
+          deviceCache_{
+              PortableHostObject<TestAlgo::UpdateInfo>{cms::alpakatools::host(),
+                                                       TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"),
+                                                                            config.getParameter<int32_t>("y"),
+                                                                            config.getParameter<int32_t>("z")}}} {}
 
     void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override {
       auto const& input = iEvent.get(getToken_);

From ad1db1651f5e6cd7f03c096724795afe92e6c932 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 9 Feb 2024 08:18:03 -0600
Subject: [PATCH 7/9] Document CopyToDeviceCache and MoveToDeviceCache in
 README

---
 HeterogeneousCore/AlpakaCore/README.md | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/HeterogeneousCore/AlpakaCore/README.md b/HeterogeneousCore/AlpakaCore/README.md
index 43ed5d0778844..586db296972c9 100644
--- a/HeterogeneousCore/AlpakaCore/README.md
+++ b/HeterogeneousCore/AlpakaCore/README.md
@@ -185,6 +185,26 @@ In the [`fillDescriptions()`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWG
 
 Also note that the `fillDescription()` function must have the same content for all backends, i.e. any backend-specific behavior with e.g. `#ifdef` or `if constexpr` are forbidden.
 
+### Copy e.g. configuration data to all devices in EDProducer
+
+While the EventSetup can be used to handle copying data to all devices
+of an Alpaka backend, for data used only by one EDProducer a simpler
+way would be to use one of
+* `cms::alpakatools::MoveToDeviceCache<TQueue, THostObject>` (recommended)
+  * `#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h"`
+  * Moves the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. On host backends the argument `THostObject` is moved around, but not copied.
+  * The `THostObject` must not be copyable
+    * This is to avoid easy mistakes with objects that follow copy semantics of `std::shared_ptr` (that includes Alpaka buffers), that would allow the source memory buffer to be used via another copy during the asynchronous data copy to the device.
+  * The constructor argument `THostObject` object may not be used, unless it is initialized again e.g. by assigning another `THostObject` into it.
+  * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns.
+* `cms::alpakatools::CopyToDeviceCache<TQueue, THostObject>` (use only if **must** use copyable `THostObject`)
+  * `#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"`
+  * Copies the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. Also host backends do a copy.
+  * The constructor argument `THostObject` object can be used for other purposes immediately after the constructor returns
+  * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns.
+
+For examples see [`HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc`](../../HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc) and [`HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc`](../../HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc).
+
 ## Guarantees
 
 * All Event data products in the device memory space are guaranteed to be accessible only for operations enqueued in the `Queue` given by `device::Event::queue()` when accessed through the `device::Event`.

From 9814921854dc5018a3558f151c865f1f5639f6d3 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 17 Dec 2024 22:33:38 +0100
Subject: [PATCH 8/9] Template {Copy,Move}ToDevice over Device instead of Queue

---
 HeterogeneousCore/AlpakaCore/README.md        |  4 +-
 .../AlpakaCore/interface/CopyToDeviceCache.h  | 37 ++++++++-----------
 .../AlpakaCore/interface/MoveToDeviceCache.h  | 35 +++++++-----------
 ...stAlpakaGlobalProducerCopyToDeviceCache.cc |  2 +-
 ...stAlpakaGlobalProducerMoveToDeviceCache.cc |  2 +-
 5 files changed, 33 insertions(+), 47 deletions(-)

diff --git a/HeterogeneousCore/AlpakaCore/README.md b/HeterogeneousCore/AlpakaCore/README.md
index 586db296972c9..844b14a8be92b 100644
--- a/HeterogeneousCore/AlpakaCore/README.md
+++ b/HeterogeneousCore/AlpakaCore/README.md
@@ -190,14 +190,14 @@ Also note that the `fillDescription()` function must have the same content for a
 While the EventSetup can be used to handle copying data to all devices
 of an Alpaka backend, for data used only by one EDProducer a simpler
 way would be to use one of
-* `cms::alpakatools::MoveToDeviceCache<TQueue, THostObject>` (recommended)
+* `cms::alpakatools::MoveToDeviceCache<TDevice, THostObject>` (recommended)
   * `#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h"`
   * Moves the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. On host backends the argument `THostObject` is moved around, but not copied.
   * The `THostObject` must not be copyable
     * This is to avoid easy mistakes with objects that follow copy semantics of `std::shared_ptr` (that includes Alpaka buffers), that would allow the source memory buffer to be used via another copy during the asynchronous data copy to the device.
   * The constructor argument `THostObject` object may not be used, unless it is initialized again e.g. by assigning another `THostObject` into it.
   * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns.
-* `cms::alpakatools::CopyToDeviceCache<TQueue, THostObject>` (use only if **must** use copyable `THostObject`)
+* `cms::alpakatools::CopyToDeviceCache<TDevice, THostObject>` (use only if **must** use copyable `THostObject`)
   * `#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"`
   * Copies the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. Also host backends do a copy.
   * The constructor argument `THostObject` object can be used for other purposes immediately after the constructor returns
diff --git a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
index f33c8c170c90c..3e3a04f0c1834 100644
--- a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
+++ b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h
@@ -15,11 +15,11 @@ namespace cms::alpakatools {
     // std::conditional_t and if constexpr) because the
     // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. for
     // PortableCollection on host device
-    template <typename TDev, typename TQueue, typename THostObject>
+    template <typename TDevice, typename THostObject>
     class CopyToDeviceCacheImpl {
     public:
-      using Device = TDev;
-      using Queue = TQueue;
+      using Device = TDevice;
+      using Queue = alpaka::Queue<Device, alpaka::NonBlocking>;
       using HostObject = THostObject;
       using Copy = CopyToDevice<HostObject>;
       using DeviceObject = decltype(Copy::copyAsync(std::declval<Queue&>(), std::declval<HostObject const&>()));
@@ -47,8 +47,8 @@ namespace cms::alpakatools {
     };
 
     // For host device, copy the host object directly instead
-    template <typename TQueue, typename THostObject>
-    class CopyToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> {
+    template <typename THostObject>
+    class CopyToDeviceCacheImpl<alpaka_common::DevHost, THostObject> {
     public:
       using HostObject = THostObject;
       using DeviceObject = HostObject;
@@ -63,9 +63,9 @@ namespace cms::alpakatools {
   }  // namespace detail
 
   /**
-   * This class template implements a cache for data that is copied
+   * This class template implements a cache for data that is copied
    * from the host (of type THostObject) to all the devices
-   * corresponding the TQueue queue type.
+   * corresponding to the TDevice device type.
    *
    * The host-side object to be copied is given as an argument to the
    * class constructor. The constructor uses the
@@ -77,31 +77,24 @@ namespace cms::alpakatools {
    * type is the return type of CopyToDevice<THostObject>::copyAsync())
    * can be obtained with get() member function, that has either the
    * queue or device argument.
-   *
-   * TODO: In principle it would be better to template over Device,
-   * but then we'd need a way to have a "default queue" type for each
-   * Device in order to infer the return type of
-   * CopyToDevice::copyAsync(). Alternatively, the template over
-   * TQueue could be removed by moving the class definition to
-   * ALPAKA_ACCELERATOR_NAMESPACE.
    */
-  template <typename TQueue, typename THostObject>
+  template <typename TDevice, typename THostObject>
+    requires alpaka::isDevice<TDevice>
   class CopyToDeviceCache {
-    using Queue = TQueue;
-    using Device = alpaka::Dev<Queue>;
+    using Device = TDevice;
     using HostObject = THostObject;
-    using Impl = detail::CopyToDeviceCacheImpl<Device, Queue, HostObject>;
+    using Impl = detail::CopyToDeviceCacheImpl<Device, HostObject>;
     using DeviceObject = typename Impl::DeviceObject;
 
   public:
     CopyToDeviceCache(THostObject const& srcData) : data_(srcData) {}
 
-    // TODO: I could make this function to return the contained object
-    // in case of alpaka buffer, PortableObject, or PortableCollection
-    // (in PortableCollection case it would be the View)
     DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); }
 
-    DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); }
+    template <typename TQueue>
+    DeviceObject const& get(TQueue const& queue) const {
+      return get(alpaka::getDev(queue));
+    }
 
   private:
     Impl data_;
diff --git a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
index 2f5bcbb765bac..2c66fd384798a 100644
--- a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
+++ b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h
@@ -17,11 +17,11 @@ namespace cms::alpakatools {
     // std::conditional_t and if constexpr) because the
     // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. for
     // PortableCollection on host device
-    template <typename TDev, typename TQueue, typename THostObject>
+    template <typename TDevice, typename THostObject>
     class MoveToDeviceCacheImpl {
     public:
       using HostObject = THostObject;
-      using Impl = CopyToDeviceCacheImpl<TDev, TQueue, THostObject>;
+      using Impl = CopyToDeviceCacheImpl<TDevice, THostObject>;
       using DeviceObject = typename Impl::DeviceObject;
 
       MoveToDeviceCacheImpl(HostObject&& srcObject) : impl_(srcObject) {}
@@ -33,8 +33,8 @@ namespace cms::alpakatools {
     };
 
     // For host device, move the host object instead
-    template <typename TQueue, typename THostObject>
-    class MoveToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> {
+    template <typename THostObject>
+    class MoveToDeviceCacheImpl<alpaka_common::DevHost, THostObject> {
     public:
       using HostObject = THostObject;
       using DeviceObject = HostObject;
@@ -51,14 +51,14 @@ namespace cms::alpakatools {
   /**
    * This class template implements a cache for data that is moved
    * from the host (of type THostObject) to all the devices
-   * corresponding the TQueue queue type.
+   * corresponding to the TDevice device type.
    *
    * The host-side object to be moved is given as an argument to the
    * class constructor. The constructor uses the
    * CopyToDevice<THostObject> class template to copy the data to the
    * devices, and waits for the data copies to finish, i.e. the
    * constructor is synchronous wrt. the data copies. The "move" is
-   * achieved by requiring the constructor argument to the rvalue
+   * achieved by requiring the constructor argument to be an rvalue
    * reference.
    *
    * Note that the host object type is required to be non-copyable.
@@ -71,21 +71,14 @@ namespace cms::alpakatools {
    * type is the return type of CopyToDevice<THostObject>::copyAsync())
    * can be obtained with get() member function, that has either the
    * queue or device argument.
-   *
-   * TODO: In principle it would be better to template over Device,
-   * but then we'd need a way to have a "default queue" type for each
-   * Device in order to infer the return type of
-   * CopyToDevice::copyAsync(). Alternatively, the template over
-   * TQueue could be removed by moving the class definition to
-   * ALPAKA_ACCELERATOR_NAMESPACE.
    */
-  template <typename TQueue, typename THostObject>
+  template <typename TDevice, typename THostObject>
+    requires alpaka::isDevice<TDevice>
   class MoveToDeviceCache {
   public:
-    using Queue = TQueue;
-    using Device = alpaka::Dev<Queue>;
+    using Device = TDevice;
     using HostObject = THostObject;
-    using Impl = detail::MoveToDeviceCacheImpl<Device, Queue, HostObject>;
+    using Impl = detail::MoveToDeviceCacheImpl<Device, HostObject>;
     using DeviceObject = typename Impl::DeviceObject;
 
     static_assert(not(std::is_copy_constructible_v<HostObject> or std::is_copy_assignable_v<HostObject>),
@@ -93,12 +86,12 @@ namespace cms::alpakatools {
 
     MoveToDeviceCache(HostObject&& srcData) : data_(std::move(srcData)) {}
 
-    // TODO: I could make this function to return the contained object
-    // in case of alpaka buffer, PortableObject, or PortableCollection
-    // (in PortableCollection case it would be the View)
     DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); }
 
-    DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); }
+    template <typename TQueue>
+    DeviceObject const& get(TQueue const& queue) const {
+      return get(alpaka::getDev(queue));
+    }
 
   private:
     Impl data_;
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc
index 331b85d093bda..8d8ea5b7ac181 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc
@@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     // implementation of the algorithm
     TestAlgo algo_;
 
-    cms::alpakatools::CopyToDeviceCache<Queue, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_;
+    cms::alpakatools::CopyToDeviceCache<Device, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_;
   };
 
 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE
diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
index 51d756e5dbc8f..6c8fe100f0ec5 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc
@@ -73,7 +73,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
     // implementation of the algorithm
     TestAlgo algo_;
 
-    cms::alpakatools::MoveToDeviceCache<Queue, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_;
+    cms::alpakatools::MoveToDeviceCache<Device, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_;
   };
 
 }  // namespace ALPAKA_ACCELERATOR_NAMESPACE

From 25fe7a6f4aac8d8c368c04631f04bd35eace7032 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Tue, 17 Dec 2024 15:42:38 -0600
Subject: [PATCH 9/9] Use Acc1D directly in TestAlgo

Co-authored-by: Andrea Bocci <fwyzard@gmail.com>
---
 .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc            | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
index a9034ead09e0f..53d7318c907df 100644
--- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
+++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc
@@ -177,8 +177,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
       }
     }
 
-    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                   portabletest::TestDeviceCollection::ConstView input,
                                   TestAlgo::UpdateInfo const* updateInfo,
                                   portabletest::TestDeviceCollection::View output) const {
@@ -228,8 +227,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
       }
     }
 
-    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                   portabletest::TestSoA::ConstView input,
                                   portabletest::TestSoA2::ConstView input2,
                                   TestAlgo::UpdateInfo const* updateInfo,
@@ -299,8 +297,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE {
       }
     }
 
-    template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>>
-    ALPAKA_FN_ACC void operator()(TAcc const& acc,
+    ALPAKA_FN_ACC void operator()(Acc1D const& acc,
                                   portabletest::TestSoA::ConstView input,
                                   portabletest::TestSoA2::ConstView input2,
                                   portabletest::TestSoA3::ConstView input3,