From c3773f7568e7e16520c50a2f053eee892d7c1415 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Wed, 22 Nov 2023 21:18:21 +0100 Subject: [PATCH 1/9] Add specialization of CopyToDevice for alpaka host buffer --- .../AlpakaInterface/interface/CopyToDevice.h | 31 +++++++++ .../AlpakaInterface/test/BuildFile.xml | 7 ++ .../test/alpaka/testCopyBufferToDevice.dev.cc | 68 +++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h b/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h index 8fad8d0729f1a..f98d475185ff3 100644 --- a/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h +++ b/HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h @@ -32,4 +32,35 @@ namespace cms::alpakatools { struct CopyToDevice; } // namespace cms::alpakatools +// specialize to Alpaka buffer +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +namespace cms::alpakatools { + // Note: can't do partial specializations along + // - CopyToDevice<host_buffer<TObject>> + // - CopyToDevice<alpaka::Buf<alpaka_common::DevHost, TObject, alpaka_common::Dim0D, alpaka_common::Idx>e + // because both host_buffer and alpaka::Buf use trait-style + // indirection that prevents template argument type deduction + template <typename TObject> + struct CopyToDevice<alpaka::BufCpu<TObject, alpaka_common::Dim0D, alpaka_common::Idx>> { + template <typename TQueue> + static auto copyAsync(TQueue& queue, host_buffer<TObject> const& src) { + using TDevice = alpaka::Dev<TQueue>; + auto dst = make_device_buffer<TObject>(queue); + alpaka::memcpy(queue, dst, src); + return dst; + } + }; + + template <typename TObject> + struct CopyToDevice<alpaka::BufCpu<TObject, alpaka_common::Dim1D, alpaka_common::Idx>> { + template <typename TQueue> + static auto copyAsync(TQueue& queue, host_buffer<TObject[]> 
const& src) { + using TDevice = alpaka::Dev<TQueue>; + auto dst = make_device_buffer<TObject[]>(queue, alpaka::getExtentProduct(src)); + alpaka::memcpy(queue, dst, src); + return dst; + } + }; +} // namespace cms::alpakatools + #endif diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 426a750e3d0b9..77aa10cc5a171 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -19,6 +19,13 @@ <flags ALPAKA_BACKENDS="1"/> </bin> +<bin name="alpakaCopyBufferToDevice" file="alpaka/testCopyBufferToDevice.dev.cc"> + <use name="alpaka"/> + <use name="catch2"/> + <use name="HeterogeneousCore/AlpakaInterface"/> + <flags ALPAKA_BACKENDS="1"/> +</bin> + <bin name="alpakaTestBackend" file="testBackend.cc"> <use name="catch2"/> <use name="HeterogeneousCore/AlpakaInterface"/> diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc new file mode 100644 index 0000000000000..209c01a6f641f --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc @@ -0,0 +1,68 @@ +#include <alpaka/alpaka.hpp> + +#define CATCH_CONFIG_MAIN +#include <catch.hpp> + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h" + +// each test binary is built for a single Alpaka backend +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace { + struct Dummy { + int x, y, z; + }; +} + +TEST_CASE("Test CopyToDevice for Alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + SECTION("Buffer of scalar") { + auto buffer_host = cms::alpakatools::make_host_buffer<Dummy>(); + + // run the test on each 
device + for (auto const& device : cms::alpakatools::devices<Platform>()) { + auto queue = Queue(device); + using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>; + auto buffer_device = Copy::copyAsync(queue, buffer_host); + alpaka::wait(queue); + } + } + + SECTION("Buffer of array with static size") { + // The buffer itself is really dynamically sized, even if the + // alpakatools API looks like the array would have static size + constexpr int N = 10; + auto buffer_host = cms::alpakatools::make_host_buffer<int[N]>(); + for (int i = 0; i < N; ++i) { + buffer_host[i] = i; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices<Platform>()) { + auto queue = Queue(device); + using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>; + auto buffer_device = Copy::copyAsync(queue, buffer_host); + alpaka::wait(queue); + REQUIRE(alpaka::getExtentProduct(buffer_device) == N); + } + } + + SECTION("Buffer of array with dynamic size") { + constexpr int N = 10; + auto buffer_host = cms::alpakatools::make_host_buffer<int[]>(N); + for (int i = 0; i < N; ++i) { + buffer_host[i] = i; + } + + // run the test on each device + for (auto const& device : cms::alpakatools::devices<Platform>()) { + auto queue = Queue(device); + using Copy = cms::alpakatools::CopyToDevice<decltype(buffer_host)>; + auto buffer_device = Copy::copyAsync(queue, buffer_host); + alpaka::wait(queue); + REQUIRE(alpaka::getExtentProduct(buffer_device) == N); + } + } +} From 59de41482ea0868dc4318cb5c1dfa4e84139448c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Wed, 22 Nov 2023 18:51:36 +0100 Subject: [PATCH 2/9] Add CopyToDeviceCache class template --- .../AlpakaCore/interface/CopyToDeviceCache.h | 111 ++++++++++++ .../test/alpaka/testCopyBufferToDevice.dev.cc | 2 +- .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 161 ++++++++++++++++++ .../AlpakaTest/plugins/alpaka/TestAlgo.h | 14 ++ 
...stAlpakaGlobalProducerCopyToDeviceCache.cc | 83 +++++++++ .../AlpakaTest/test/testAlpakaModules_cfg.py | 18 +- 6 files changed, 387 insertions(+), 2 deletions(-) create mode 100644 HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h create mode 100644 HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc diff --git a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h new file mode 100644 index 0000000000000..f33c8c170c90c --- /dev/null +++ b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h @@ -0,0 +1,111 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_CopyToDeviceCache_h +#define HeterogeneousCore_AlpakaInterface_interface_CopyToDeviceCache_h + +#include <alpaka/alpaka.hpp> + +#include "HeterogeneousCore/AlpakaCore/interface/QueueCache.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" + +namespace cms::alpakatools { + namespace detail { + // By default copy the host object with CopyToDevice<T> + // + // Doing with template specialization (rather than + // std::conditional_t and if constexpr) because the + // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. 
for + // PortableCollection on host device + template <typename TDev, typename TQueue, typename THostObject> + class CopyToDeviceCacheImpl { + public: + using Device = TDev; + using Queue = TQueue; + using HostObject = THostObject; + using Copy = CopyToDevice<HostObject>; + using DeviceObject = decltype(Copy::copyAsync(std::declval<Queue&>(), std::declval<HostObject const&>())); + + CopyToDeviceCacheImpl(HostObject const& srcObject) { + using Platform = alpaka::Platform<Device>; + auto const& devices = cms::alpakatools::devices<Platform>(); + std::vector<std::shared_ptr<Queue>> queues; + queues.reserve(devices.size()); + data_.reserve(devices.size()); + for (auto const& dev : devices) { + auto queue = getQueueCache<Queue>().get(dev); + data_.emplace_back(Copy::copyAsync(*queue, srcObject)); + queues.emplace_back(std::move(queue)); // keep the queue so it can be waited on below + } + for (auto& queuePtr : queues) { // wait on every queue used for the copies above + alpaka::wait(*queuePtr); + } + } + + DeviceObject const& get(size_t i) const { return data_[i]; } // i indexes data_, filled in the iteration order of devices<Platform>() + + private: + std::vector<DeviceObject> data_; + }; + + // For host device, copy the host object directly instead + template <typename TQueue, typename THostObject> + class CopyToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> { + public: + using HostObject = THostObject; + using DeviceObject = HostObject; + + CopyToDeviceCacheImpl(HostObject const& srcObject) : data_(srcObject) {} + + DeviceObject const& get(size_t i) const { return data_; } // i is ignored: the single host-side copy is returned for any index + + private: + HostObject data_; + }; + } // namespace detail + + /** + * This class template implements a cache for data that is copied + * from the host (of type THostObject) to all the devices + * corresponding to the TQueue queue type. + * + * The host-side object to be copied is given as an argument to the + * class constructor. The constructor uses the + * CopyToDevice<THostObject> class template to perform the copy, and + * waits for the data copies to finish, i.e. the constructor is + * synchronous wrt. the data copies. 
+ * + * The device-side object corresponding to the THostObject (actual + * type is the return type of CopyToDevice<THostObject>::copyAsync()) + * can be obtained with get() member function, that has either the + * queue or device argument. + * + * TODO: In principle it would be better to template over Device, + * but then we'd need a way to have a "default queue" type for each + * Device in order to infer the return type of + * CopyToDevice::copyAsync(). Alternatively, the template over + * TQueue could be removed by moving the class definition to + * ALPAKA_ACCELERATOR_NAMESPACE. + */ + template <typename TQueue, typename THostObject> + class CopyToDeviceCache { + using Queue = TQueue; + using Device = alpaka::Dev<Queue>; + using HostObject = THostObject; + using Impl = detail::CopyToDeviceCacheImpl<Device, Queue, HostObject>; + using DeviceObject = typename Impl::DeviceObject; + + public: + CopyToDeviceCache(THostObject const& srcData) : data_(srcData) {} + + // TODO: I could make this function to return the contained object + // in case of alpaka buffer, PortableObject, or PortableCollection + // (in PortableCollection case it would be the View) + DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); } + + DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); } + + private: + Impl data_; + }; +} // namespace cms::alpakatools + +#endif diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc index 209c01a6f641f..b4bbb2868412e 100644 --- a/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testCopyBufferToDevice.dev.cc @@ -14,7 +14,7 @@ namespace { struct Dummy { int x, y, z; }; -} +} // namespace TEST_CASE("Test CopyToDevice for Alpaka buffers for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " 
backend", "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc index 3afbc3d9d8103..a9034ead09e0f 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc @@ -176,6 +176,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; } } + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + portabletest::TestDeviceCollection::ConstView input, + TestAlgo::UpdateInfo const* updateInfo, + portabletest::TestDeviceCollection::View output) const { + // set this only once in the whole kernel grid + if (once_per_grid(acc)) { + output.r() = input.r(); + } + + // make a strided loop over the kernel grid, covering up to "size" elements + for (int32_t i : uniform_elements(acc, output.metadata().size())) { + double x = input[i].x(); + x += updateInfo->x; + output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; + } + } }; class TestAlgoKernelUpdateMulti2 { @@ -209,6 +227,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()}; } } + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + portabletest::TestSoA::ConstView input, + portabletest::TestSoA2::ConstView input2, + TestAlgo::UpdateInfo const* updateInfo, + portabletest::TestSoA::View output, + portabletest::TestSoA2::View output2) const { + // set this only once in the whole kernel grid + if (once_per_grid(acc)) { + output.r() = input.r(); + output2.r2() = input2.r2(); + } + + // make a strided loop over the kernel grid, covering up to "size" elements + for 
(int32_t i : uniform_elements(acc, output.metadata().size())) { + double x = input[i].x(); + x += updateInfo->x; + output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; + } + for (int32_t i : uniform_elements(acc, output2.metadata().size())) { + double x2 = input2[i].x2(); + x2 += updateInfo->x; + output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()}; + } + } }; class TestAlgoKernelUpdateMulti3 { @@ -254,6 +298,42 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()}; } } + + template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> + ALPAKA_FN_ACC void operator()(TAcc const& acc, + portabletest::TestSoA::ConstView input, + portabletest::TestSoA2::ConstView input2, + portabletest::TestSoA3::ConstView input3, + TestAlgo::UpdateInfo const* updateInfo, + portabletest::TestSoA::View output, + portabletest::TestSoA2::View output2, + portabletest::TestSoA3::View output3) const { + // set this only once in the whole kernel grid + if (once_per_grid(acc)) { + output.r() = input.r(); + output2.r2() = input2.r2(); + output3.r3() = input3.r3(); + } + + // make a strided loop over the kernel grid, covering up to "size" elements + for (int32_t i : uniform_elements(acc, output.metadata().size())) { + double x = input[i].x(); + x += updateInfo->x; + if (0 == i) + printf("Setting x[0] to %f\n", x); + output[i] = {x, input[i].y(), input[i].z(), input[i].id(), input[i].flags(), input[i].m()}; + } + for (int32_t i : uniform_elements(acc, output2.metadata().size())) { + double x2 = input2[i].x2(); + x2 += updateInfo->x; + output2[i] = {x2, input2[i].y2(), input2[i].z2(), input2[i].id2(), input2[i].m2()}; + } + for (int32_t i : uniform_elements(acc, output3.metadata().size())) { + double x3 = input3[i].x3(); + x3 += updateInfo->x; + output3[i] = {x3, input3[i].y3(), input3[i].z3(), input3[i].id3(), input3[i].m3()}; + } + } 
}; portabletest::TestDeviceCollection TestAlgo::update(Queue& queue, @@ -337,6 +417,87 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { return collection; } + portabletest::TestDeviceCollection TestAlgo::update(Queue& queue, + portabletest::TestDeviceCollection const& input, + UpdateInfo const* d_updateInfo) const { + portabletest::TestDeviceCollection collection{input->metadata().size(), queue}; + + // use 64 items per group (this value is arbitrary, but it's a reasonable starting point) + uint32_t items = 64; + + // use as many groups as needed to cover the whole problem + uint32_t groups = divide_up_by(collection->metadata().size(), items); + + // map items to + // - threads with a single element per thread on a GPU backend + // - elements within a single thread on a CPU backend + auto workDiv = make_workdiv<Acc1D>(groups, items); + + alpaka::exec<Acc1D>(queue, workDiv, TestAlgoKernelUpdate{}, input.view(), d_updateInfo, collection.view()); + + return collection; + } + + portabletest::TestDeviceMultiCollection2 TestAlgo::updateMulti2(Queue& queue, + portabletest::TestDeviceMultiCollection2 const& input, + UpdateInfo const* d_updateInfo) const { + portabletest::TestDeviceMultiCollection2 collection{input.sizes(), queue}; + + // use 64 items per group (this value is arbitrary, but it's a reasonable starting point) + uint32_t items = 64; + + // use as many groups as needed to cover the whole problem + auto sizes = collection.sizes(); + uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items); + + // map items to + // - threads with a single element per thread on a GPU backend + // - elements within a single thread on a CPU backend + auto workDiv = make_workdiv<Acc1D>(groups, items); + + alpaka::exec<Acc1D>(queue, + workDiv, + TestAlgoKernelUpdateMulti2{}, + input.view<portabletest::TestSoA>(), + input.view<portabletest::TestSoA2>(), + d_updateInfo, + collection.view<portabletest::TestSoA>(), + collection.view<portabletest::TestSoA2>()); + + 
return collection; + } + + portabletest::TestDeviceMultiCollection3 TestAlgo::updateMulti3(Queue& queue, + portabletest::TestDeviceMultiCollection3 const& input, + UpdateInfo const* d_updateInfo) const { + portabletest::TestDeviceMultiCollection3 collection{input.sizes(), queue}; + + // use 64 items per group (this value is arbitrary, but it's a reasonable starting point) + uint32_t items = 64; + + // use as many groups as needed to cover the whole problem + auto sizes = collection.sizes(); + uint32_t groups = divide_up_by(*std::max_element(sizes.begin(), sizes.end()), items); + + // map items to + // - threads with a single element per thread on a GPU backend + // - elements within a single thread on a CPU backend + auto workDiv = make_workdiv<Acc1D>(groups, items); + + alpaka::exec<Acc1D>(queue, + workDiv, + TestAlgoKernelUpdateMulti3{}, + input.view<portabletest::TestSoA>(), + input.view<portabletest::TestSoA2>(), + input.view<portabletest::TestSoA3>(), + d_updateInfo, + collection.view<portabletest::TestSoA>(), + collection.view<portabletest::TestSoA2>(), + collection.view<portabletest::TestSoA3>()); + + return collection; + } + class TestZeroCollectionKernel { public: template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h index f943eacddd1c3..dbebf60e898b5 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.h @@ -17,6 +17,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { portabletest::TestDeviceCollection update(Queue& queue, portabletest::TestDeviceCollection const& input, AlpakaESTestDataEDevice const& esData) const; + portabletest::TestDeviceMultiCollection2 updateMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2 const& input, AlpakaESTestDataEDevice const& esData) const; @@ -24,6 +25,19 @@ namespace 
ALPAKA_ACCELERATOR_NAMESPACE { portabletest::TestDeviceMultiCollection3 const& input, AlpakaESTestDataEDevice const& esData) const; + struct UpdateInfo { + int x, y, z; + }; + portabletest::TestDeviceCollection update(Queue& queue, + portabletest::TestDeviceCollection const& input, + UpdateInfo const* d_updateInfo) const; + portabletest::TestDeviceMultiCollection2 updateMulti2(Queue& queue, + portabletest::TestDeviceMultiCollection2 const& input, + UpdateInfo const* d_updateInfo) const; + portabletest::TestDeviceMultiCollection3 updateMulti3(Queue& queue, + portabletest::TestDeviceMultiCollection3 const& input, + UpdateInfo const* d_updateInfo) const; + void fillMulti2(Queue& queue, portabletest::TestDeviceMultiCollection2& collection, double xvalue = 0.) const; void fillMulti3(Queue& queue, portabletest::TestDeviceMultiCollection3& collection, double xvalue = 0.) const; diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc new file mode 100644 index 0000000000000..331b85d093bda --- /dev/null +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc @@ -0,0 +1,83 @@ +#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +#include "TestAlgo.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + /** + * This class demonstrates a global EDProducer 
that + * - uses a CopyToDeviceCache to copy some host-side data to the devices of the backend. + * - produces a device EDProduct (that can get transferred to host automatically) + */ + class TestAlpakaGlobalProducerCopyToDeviceCache : public global::EDProducer<> { + public: + TestAlpakaGlobalProducerCopyToDeviceCache(edm::ParameterSet const& config) + : getToken_(consumes(config.getParameter<edm::InputTag>("source"))), + getTokenMulti2_(consumes(config.getParameter<edm::InputTag>("source"))), + getTokenMulti3_(consumes(config.getParameter<edm::InputTag>("source"))), + putToken_{produces()}, + putTokenMulti2_{produces()}, + putTokenMulti3_{produces()}, + // create host-side object that gets implicitly copied to all devices of the backend + deviceCache_{[&config]() { + auto buffer = cms::alpakatools::make_host_buffer<TestAlgo::UpdateInfo>(); + *buffer = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"), + config.getParameter<int32_t>("y"), + config.getParameter<int32_t>("z")}; + return buffer; + }()} {} + + void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override { + auto const& input = iEvent.get(getToken_); + auto const& inputMulti2 = iEvent.get(getTokenMulti2_); + auto const& inputMulti3 = iEvent.get(getTokenMulti3_); + + // get the object corresponding to the Device the Event is being processed on + auto const& infoBuffer = deviceCache_.get(iEvent.queue()); + + // run the algorithm, potentially asynchronously + auto deviceProduct = algo_.update(iEvent.queue(), input, infoBuffer.data()); + auto deviceProductMulti2 = algo_.updateMulti2(iEvent.queue(), inputMulti2, infoBuffer.data()); + auto deviceProductMulti3 = algo_.updateMulti3(iEvent.queue(), inputMulti3, infoBuffer.data()); + + iEvent.emplace(putToken_, std::move(deviceProduct)); + iEvent.emplace(putTokenMulti2_, std::move(deviceProductMulti2)); + iEvent.emplace(putTokenMulti3_, std::move(deviceProductMulti3)); + } + + static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("source", edm::InputTag{}); + desc.add<int32_t>("x", 0); + desc.add<int32_t>("y", 1); + desc.add<int32_t>("z", 2); + + descriptions.addWithDefaultLabel(desc); + } + + private: + const device::EDGetToken<portabletest::TestDeviceCollection> getToken_; + const device::EDGetToken<portabletest::TestDeviceMultiCollection2> getTokenMulti2_; + const device::EDGetToken<portabletest::TestDeviceMultiCollection3> getTokenMulti3_; + const device::EDPutToken<portabletest::TestDeviceCollection> putToken_; + const device::EDPutToken<portabletest::TestDeviceMultiCollection2> putTokenMulti2_; + const device::EDPutToken<portabletest::TestDeviceMultiCollection3> putTokenMulti3_; + + // implementation of the algorithm + TestAlgo algo_; + + cms::alpakatools::CopyToDeviceCache<Queue, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" +DEFINE_FWK_ALPAKA_MODULE(TestAlpakaGlobalProducerCopyToDeviceCache); diff --git a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py index 62279b26b3010..bdacd11d7e4c2 100644 --- a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py +++ b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py @@ -78,6 +78,12 @@ process.alpakaGlobalProducerE = cms.EDProducer("TestAlpakaGlobalProducerE@alpaka", source = cms.InputTag("alpakaGlobalProducer") ) +process.alpakaGlobalProducerCopyToDeviceCache = cms.EDProducer("TestAlpakaGlobalProducerCopyToDeviceCache@alpaka", + source = cms.InputTag("alpakaGlobalProducer"), + x = cms.int32(3), + y = cms.int32(4), + z = cms.int32(5), +) process.alpakaStreamProducer = cms.EDProducer("TestAlpakaStreamProducer@alpaka", source = cms.InputTag("intProduct"), eventSetupSource = 
cms.ESInputTag("alpakaESProducerB", "explicitLabel"), @@ -122,6 +128,10 @@ source = "alpakaGlobalProducerE", expectXvalues = cms.vdouble([(i%2)*10+1 + abs(27)+i*2 for i in range(0,5)] + [0]*5) ) +process.alpakaGlobalConsumerCopyToDeviceCache = process.alpakaGlobalConsumer.clone( + source = "alpakaGlobalProducerCopyToDeviceCache", + expectXvalues = cms.vdouble([3]*10) +) process.alpakaStreamConsumer = cms.EDAnalyzer("TestAlpakaAnalyzer", source = cms.InputTag("alpakaStreamProducer"), expectSize = cms.int32(5), @@ -153,7 +163,7 @@ if args.moduleBackend != "": for name in ["ESProducerA", "ESProducerB", "ESProducerC", "ESProducerD", "ESProducerE", "ESProducerAMulti", "ESProducerNull", - "GlobalProducer", "GlobalProducerE", + "GlobalProducer", "GlobalProducerE", "GlobalProducerCopyToDeviceCache", "StreamProducer", "StreamInstanceProducer", "StreamSynchronizingProducer", "StreamSynchronizingProducerToDevice", "GlobalDeviceConsumer", "StreamDeviceConsumer", @@ -168,6 +178,8 @@ def setExpect(m, size): setExpect(process.alpakaGlobalConsumer, size=20) setExpect(process.alpakaGlobalConsumerE, size=20) process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(20-10)) + setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size=20) + process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*20 setExpect(process.alpakaStreamConsumer, size=25) setExpect(process.alpakaStreamInstanceConsumer, size=36) setExpect(process.alpakaStreamSynchronizingConsumer, size=20) @@ -178,6 +190,8 @@ def setExpect(m, size): setExpect(process.alpakaGlobalConsumer, size = 30) setExpect(process.alpakaGlobalConsumerE, size = 30) process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(30-10)) + setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size = 30) + process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*30 setExpect(process.alpakaStreamConsumer, size = 125) setExpect(process.alpakaStreamInstanceConsumer, size = 216) setExpect(process.alpakaStreamSynchronizingConsumer, 
size = 30) @@ -196,6 +210,7 @@ def setExpect(m, size): process.intProduct, process.alpakaGlobalProducer, process.alpakaGlobalProducerE, + process.alpakaGlobalProducerCopyToDeviceCache, process.alpakaStreamProducer, process.alpakaStreamInstanceProducer, process.alpakaStreamSynchronizingProducer, @@ -205,6 +220,7 @@ def setExpect(m, size): process.alpakaGlobalConsumer+ process.alpakaGlobalDeviceConsumer+ process.alpakaGlobalConsumerE+ + process.alpakaGlobalConsumerCopyToDeviceCache+ process.alpakaStreamConsumer+ process.alpakaStreamDeviceConsumer+ process.alpakaStreamInstanceConsumer+ From d70f43b9de634b457ffee1df43a90f01eba6bcb1 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Wed, 10 Jan 2024 23:12:02 +0100 Subject: [PATCH 3/9] Add moveToDeviceAsync function --- HeterogeneousCore/AlpakaInterface/README.md | 38 +++++- .../interface/moveToDeviceAsync.h | 47 +++++++ .../AlpakaInterface/test/BuildFile.xml | 7 + .../test/alpaka/testMoveToDeviceAsync.dev.cc | 121 ++++++++++++++++++ 4 files changed, 212 insertions(+), 1 deletion(-) create mode 100644 HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc diff --git a/HeterogeneousCore/AlpakaInterface/README.md b/HeterogeneousCore/AlpakaInterface/README.md index 3d90abdf5a2b5..e6fb83eb41d19 100644 --- a/HeterogeneousCore/AlpakaInterface/README.md +++ b/HeterogeneousCore/AlpakaInterface/README.md @@ -137,7 +137,9 @@ See the previous section for considerations about the use of device-mapped memory. 
-## A note about copies and synchronisation +## Notes about copies and synchronisation + +### Host-to-device copy When copying data from a host buffer to a device buffer, _e.g._ with ```c++ @@ -163,6 +165,40 @@ std::memset(a_host_buffer.data(), 0x00, size); is likely to overwrite part of the buffer while the copy is still ongoing, resulting in `a_device_buffer` with incomplete and corrupted contents. +### Host-to-device move + +For host data types that are movable and not copyable one can, to +large degree, avoid worrying about the caveats above about avoiding +any operations on the host with the following utility and move semantics +```c++ +#include "HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h" +// ... +auto device_object = cms::alpakatools::moveToDeviceAsync(queue, std::move(host_object)); +``` + +Here the host-side `host_object` is _moved_ to the +`moveToDeviceAsync()` function, which returns a correponding +device-side `device_object`. In this case any subsequent use of +`host_object` is clearly "use after move", which is easier to catch in +code review or by static analysis tools than the consequences of +`alpaka::mempcy()`. + +The `cms::alpakatools::CopyToDevice<T>` class temlate must have a +specialization for the host data type (otherwise the compilation will fail). + +As mentioned above, the host data type must be movable but not +copyable (the compilation will fail with copyable types). For example, +the `PortableHostCollection` and `PortableHostObject` class templates +can be used, but Alpaka buffers can not be directly used. + +The host data object should manage memory in +[queue-ordered](#allocating-queue-ordered-host-buffers-in-device-mapped-memory) +way. If not, the object must synchronize the device and the host in +its destructor (although such synchronization is undesirable). +Otherwise, the behavior is undefined. 
+ +### Device-to-host copy + When copying data from a device buffer to a host buffer, _e.g._ with ```c++ alpaka::memcpy(queue, a_host_buffer, a_device_buffer); diff --git a/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h b/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h new file mode 100644 index 0000000000000..25aab77685163 --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h @@ -0,0 +1,47 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_moveToDeviceAsync_h +#define HeterogeneousCore_AlpakaInterface_interface_moveToDeviceAsync_h + +#include <type_traits> + +#include <alpaka/alpaka.hpp> + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h" + +namespace cms::alpakatools { + /** + * This function moves the argument hostObject object to the device + * specified by the queue. Here the "move" means that the argument + * host object must not be used in the caller after this function + * has been called. + * + * The CopyToDevice class template is used to define the returned + * device object that corresponds to the argument host object. For host + * device the copying is skipped, and the hostObject is returned directly. + * + * The host object must either + * - allocate its memory in a queue-ordered way (e.g. using make_host_buffer(TQueue, ...)), or + * - synchronize in its destructor (makes this function synchronous, so not preferred) + * If the host object uses non-queue-order-allocated memory, and + * does not synchronize in its destructor, behavior is undefined. + * + * Note that the host object type is required to be non-copyable. + * This is to avoid easy mistakes with objects that follow copy + * semantics of std::shared_ptr (that includes Alpaka buffers), that + * would allow the source memory buffer to be used via another copy + * during the asynchronous data copy to the device. 
+ */ + template <typename TQueue, typename THostObject, typename = std::enable_if_t<alpaka::isQueue<TQueue>>> + auto moveToDeviceAsync(TQueue& queue, THostObject hostObject) { + static_assert(not(std::is_copy_constructible_v<THostObject> or std::is_copy_assignable_v<THostObject>), + "The data object to be moved to device must not be copyable."); + + if constexpr (std::is_same_v<alpaka::Dev<TQueue>, alpaka_common::DevHost>) { + return hostObject; + } else { + return CopyToDevice<THostObject>::copyAsync(queue, hostObject); + } + } +} // namespace cms::alpakatools + +#endif diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml index 77aa10cc5a171..4c0d1ffff0b27 100644 --- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml +++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml @@ -26,6 +26,13 @@ <flags ALPAKA_BACKENDS="1"/> </bin> +<bin name="alpakaMoveToDeviceAsync" file="alpaka/testMoveToDeviceAsync.dev.cc"> + <use name="alpaka"/> + <use name="catch2"/> + <use name="HeterogeneousCore/AlpakaInterface"/> + <flags ALPAKA_BACKENDS="1"/> +</bin> + <bin name="alpakaTestBackend" file="testBackend.cc"> <use name="catch2"/> <use name="HeterogeneousCore/AlpakaInterface"/> diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc new file mode 100644 index 0000000000000..1e3b7f80b318f --- /dev/null +++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testMoveToDeviceAsync.dev.cc @@ -0,0 +1,121 @@ +#include <optional> +#include <type_traits> + +#include <alpaka/alpaka.hpp> + +#define CATCH_CONFIG_MAIN +#include <catch.hpp> + +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "HeterogeneousCore/AlpakaInterface/interface/CopyToDevice.h" +#include "HeterogeneousCore/AlpakaInterface/interface/memory.h" +#include "HeterogeneousCore/AlpakaInterface/interface/moveToDeviceAsync.h" 
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" + +// each test binary is built for a single Alpaka backend +using namespace ALPAKA_ACCELERATOR_NAMESPACE; + +namespace { + template <typename T> + class TestHostBuffer { + public: + using Buffer = cms::alpakatools::host_buffer<T[]>; + using ConstBuffer = cms::alpakatools::const_host_buffer<T[]>; + + template <typename TQueue> + TestHostBuffer(TQueue const& queue, int size) : buffer_(cms::alpakatools::make_host_buffer<T[]>(queue, size)) {} + + TestHostBuffer(TestHostBuffer const&) = delete; + TestHostBuffer& operator=(TestHostBuffer const&) = delete; + ; + TestHostBuffer(TestHostBuffer&& other) { + buffer_ = std::move(*other.buffer_); + other.buffer_.reset(); + } + TestHostBuffer& operator=(TestHostBuffer& other) { + buffer_ = std::move(*other.buffer_); + other.buffer_.reset(); + return this; + } + + bool has_value() const { return buffer_.has_value(); } + + T* data() { return buffer_->data(); } + + Buffer buffer() { return *buffer_; } + ConstBuffer buffer() const { return *buffer_; } + + private: + std::optional<Buffer> buffer_; + }; + + template <typename T, typename TDev> + class TestDeviceBuffer { + public: + using Buffer = cms::alpakatools::device_buffer<TDev, T[]>; + + template <typename TQueue> + TestDeviceBuffer(TQueue const& queue, int size) : buffer_(cms::alpakatools::make_device_buffer<T[]>(queue, size)) {} + + T* data() { return buffer_.data(); } + + Buffer buffer() { return buffer_; } + + private: + Buffer buffer_; + }; + + template <typename T> + void fillBuffer(TestHostBuffer<T>& buffer) { + for (int i = 0, size = alpaka::getExtentProduct(buffer.buffer()); i < size; ++i) { + buffer.data()[i] = i; + } + } +} // namespace + +namespace cms::alpakatools { + template <typename T> + struct CopyToDevice<TestHostBuffer<T>> { + template <typename TQueue> + static auto copyAsync(TQueue& queue, TestHostBuffer<T> const& hostBuffer) { + TestDeviceBuffer<T, alpaka::Dev<TQueue>> 
deviceBuffer(queue, alpaka::getExtentProduct(hostBuffer.buffer())); + alpaka::memcpy(queue, deviceBuffer.buffer(), hostBuffer.buffer()); + return deviceBuffer; + } + }; +} // namespace cms::alpakatools + +TEST_CASE("Test moveToDeviceAsync() for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend", + "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") { + // run the test on each device + for (auto const& device : cms::alpakatools::devices<Platform>()) { + auto queue = Queue(device); + constexpr int size = 32; + TestHostBuffer<int> buffer_host(queue, size); + fillBuffer(buffer_host); + auto const* ptr_host = buffer_host.data(); + + auto buffer_device = cms::alpakatools::moveToDeviceAsync(queue, std::move(buffer_host)); + REQUIRE(not buffer_host.has_value()); + if constexpr (std::is_same_v<Device, alpaka_common::DevHost>) { + REQUIRE(buffer_device.data() == ptr_host); + } else { + REQUIRE(buffer_device.data() != ptr_host); + } + alpaka::exec<Acc1D>( + queue, + cms::alpakatools::make_workdiv<Acc1D>(1, size), + [] ALPAKA_FN_ACC(Acc1D const& acc, int const* data) { + for (int i : cms::alpakatools::uniform_elements(acc)) { + assert(data[i] == i); + } + }, + buffer_device.data()); + alpaka::wait(queue); + + /* the following should not compile + auto buffer2_host = cms::alpakatools::make_host_buffer<int>(); + auto buffer2_device = cms::alpakatools::moveToDeviceAsync(queue, std::move(buffer2_host)); + */ + } +} From 60da57efd157612e118b46a5190ce9caed149290 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Fri, 12 Jan 2024 23:15:49 +0100 Subject: [PATCH 4/9] Add MoveToDeviceCache class template --- .../AlpakaCore/interface/MoveToDeviceCache.h | 108 ++++++++++++++++++ .../AlpakaTest/plugins/BuildFile.xml | 1 + ...stAlpakaGlobalProducerMoveToDeviceCache.cc | 84 ++++++++++++++ .../AlpakaTest/test/testAlpakaModules_cfg.py | 19 ++- 4 files changed, 211 insertions(+), 1 deletion(-) create mode 100644 
HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h create mode 100644 HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc diff --git a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h new file mode 100644 index 0000000000000..2f5bcbb765bac --- /dev/null +++ b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h @@ -0,0 +1,108 @@ +#ifndef HeterogeneousCore_AlpakaInterface_interface_MoveToDeviceCache_h +#define HeterogeneousCore_AlpakaInterface_interface_MoveToDeviceCache_h + +#include <type_traits> + +#include <alpaka/alpaka.hpp> + +#include "HeterogeneousCore/AlpakaCore/interface/QueueCache.h" +#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h" +#include "HeterogeneousCore/AlpakaInterface/interface/devices.h" + +namespace cms::alpakatools { + namespace detail { + // By default copy the host object with CopyToDevice<T> + // + // Doing with template specialization (rather than + // std::conditional_t and if constexpr) because the + // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. 
for + // PortableCollection on host device + template <typename TDev, typename TQueue, typename THostObject> + class MoveToDeviceCacheImpl { + public: + using HostObject = THostObject; + using Impl = CopyToDeviceCacheImpl<TDev, TQueue, THostObject>; + using DeviceObject = typename Impl::DeviceObject; + + MoveToDeviceCacheImpl(HostObject&& srcObject) : impl_(srcObject) {} + + DeviceObject const& get(size_t i) const { return impl_.get(i); } + + private: + Impl impl_; + }; + + // For host device, move the host object instead + template <typename TQueue, typename THostObject> + class MoveToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> { + public: + using HostObject = THostObject; + using DeviceObject = HostObject; + + MoveToDeviceCacheImpl(HostObject&& srcObject) : data_(std::move(srcObject)) {} + + DeviceObject const& get(size_t i) const { return data_; } + + private: + HostObject data_; + }; + } // namespace detail + + /** + * This class template implements a cache for data that is moved + * from the host (of type THostObject) to all the devices + * corresponding the TQueue queue type. + * + * The host-side object to be moved is given as an argument to the + * class constructor. The constructor uses the + * CopyToDevice<THostObject> class template to copy the data to the + * devices, and waits for the data copies to finish, i.e. the + * constructor is synchronous wrt. the data copies. The "move" is + * achieved by requiring the constructor argument to the rvalue + * reference. + * + * Note that the host object type is required to be non-copyable. + * This is to avoid easy mistakes with objects that follow copy + * semantics of std::shared_ptr (that includes Alpaka buffers), that + * would allow the source memory buffer to be used via another copy + * during the asynchronous data copy to the device. 
+ * + * The device-side object corresponding to the THostObject (actual + * type is the return type of CopyToDevice<THostObject>::copyAsync()) + * can be obtained with get() member function, that has either the + * queue or device argument. + * + * TODO: In principle it would be better to template over Device, + * but then we'd need a way to have a "default queue" type for each + * Device in order to infer the return type of + * CopyToDevice::copyAsync(). Alternatively, the template over + * TQueue could be removed by moving the class definition to + * ALPAKA_ACCELERATOR_NAMESPACE. + */ + template <typename TQueue, typename THostObject> + class MoveToDeviceCache { + public: + using Queue = TQueue; + using Device = alpaka::Dev<Queue>; + using HostObject = THostObject; + using Impl = detail::MoveToDeviceCacheImpl<Device, Queue, HostObject>; + using DeviceObject = typename Impl::DeviceObject; + + static_assert(not(std::is_copy_constructible_v<HostObject> or std::is_copy_assignable_v<HostObject>), + "The data object to be moved to device must not be copyable."); + + MoveToDeviceCache(HostObject&& srcData) : data_(std::move(srcData)) {} + + // TODO: I could make this function to return the contained object + // in case of alpaka buffer, PortableObject, or PortableCollection + // (in PortableCollection case it would be the View) + DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); } + + DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); } + + private: + Impl data_; + }; +} // namespace cms::alpakatools + +#endif diff --git a/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml b/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml index a4058755409f7..53652048838f7 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml +++ b/HeterogeneousCore/AlpakaTest/plugins/BuildFile.xml @@ -19,6 +19,7 @@ The dependency on "DataFormats/PortableTestObjects" automatically expands to include the 
host-only library (if it exists) and the corresponding Alpaka libraries (if they exist) --> + <use name="DataFormats/Portable"/> <use name="DataFormats/PortableTestObjects"/> <use name="DataFormats/TestObjects"/> <use name="FWCore/Framework"/> diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc new file mode 100644 index 0000000000000..4ca7888002872 --- /dev/null +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc @@ -0,0 +1,84 @@ +#include "DataFormats/Portable/interface/PortableObject.h" +#include "DataFormats/PortableTestObjects/interface/alpaka/TestDeviceCollection.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/InputTag.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/global/EDProducer.h" +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/EDPutToken.h" +#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" + +#include "TestAlgo.h" + +namespace ALPAKA_ACCELERATOR_NAMESPACE { + /** + * This class demonstrates a global EDProducer that + * - uses a MoveToDeviceCache to copy some host-side data to the devices of the backend. 
+ * - produces a device EDProduct (that can get transferred to host automatically) + */ + class TestAlpakaGlobalProducerMoveToDeviceCache : public global::EDProducer<> { + public: + TestAlpakaGlobalProducerMoveToDeviceCache(edm::ParameterSet const& config) + : getToken_(consumes(config.getParameter<edm::InputTag>("source"))), + getTokenMulti2_(consumes(config.getParameter<edm::InputTag>("source"))), + getTokenMulti3_(consumes(config.getParameter<edm::InputTag>("source"))), + putToken_{produces()}, + putTokenMulti2_{produces()}, + putTokenMulti3_{produces()}, + // create host-side object that gets implicitly copied to all devices of the backend + deviceCache_{[&config]() { + PortableHostObject<TestAlgo::UpdateInfo> obj(cms::alpakatools::host()); + *obj = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"), + config.getParameter<int32_t>("y"), + config.getParameter<int32_t>("z")}; + return obj; + }()} {} + + void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override { + auto const& input = iEvent.get(getToken_); + auto const& inputMulti2 = iEvent.get(getTokenMulti2_); + auto const& inputMulti3 = iEvent.get(getTokenMulti3_); + + // get the object corresponding to the Device the Event is being processed on + auto const& infoObj = deviceCache_.get(iEvent.queue()); + + // run the algorithm, potentially asynchronously + auto deviceProduct = algo_.update(iEvent.queue(), input, infoObj.data()); + auto deviceProductMulti2 = algo_.updateMulti2(iEvent.queue(), inputMulti2, infoObj.data()); + auto deviceProductMulti3 = algo_.updateMulti3(iEvent.queue(), inputMulti3, infoObj.data()); + + iEvent.emplace(putToken_, std::move(deviceProduct)); + iEvent.emplace(putTokenMulti2_, std::move(deviceProductMulti2)); + iEvent.emplace(putTokenMulti3_, std::move(deviceProductMulti3)); + } + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + + desc.add("source", edm::InputTag{}); + 
desc.add<int32_t>("x", 0); + desc.add<int32_t>("y", 1); + desc.add<int32_t>("z", 2); + + descriptions.addWithDefaultLabel(desc); + } + + private: + const device::EDGetToken<portabletest::TestDeviceCollection> getToken_; + const device::EDGetToken<portabletest::TestDeviceMultiCollection2> getTokenMulti2_; + const device::EDGetToken<portabletest::TestDeviceMultiCollection3> getTokenMulti3_; + const device::EDPutToken<portabletest::TestDeviceCollection> putToken_; + const device::EDPutToken<portabletest::TestDeviceMultiCollection2> putTokenMulti2_; + const device::EDPutToken<portabletest::TestDeviceMultiCollection3> putTokenMulti3_; + + // implementation of the algorithm + TestAlgo algo_; + + cms::alpakatools::MoveToDeviceCache<Queue, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_; + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE + +#include "HeterogeneousCore/AlpakaCore/interface/alpaka/MakerMacros.h" +DEFINE_FWK_ALPAKA_MODULE(TestAlpakaGlobalProducerMoveToDeviceCache); diff --git a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py index bdacd11d7e4c2..ac39117119cce 100644 --- a/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py +++ b/HeterogeneousCore/AlpakaTest/test/testAlpakaModules_cfg.py @@ -84,6 +84,12 @@ y = cms.int32(4), z = cms.int32(5), ) +process.alpakaGlobalProducerMoveToDeviceCache = cms.EDProducer("TestAlpakaGlobalProducerMoveToDeviceCache@alpaka", + source = cms.InputTag("alpakaGlobalProducer"), + x = cms.int32(32), + y = cms.int32(42), + z = cms.int32(52), +) process.alpakaStreamProducer = cms.EDProducer("TestAlpakaStreamProducer@alpaka", source = cms.InputTag("intProduct"), eventSetupSource = cms.ESInputTag("alpakaESProducerB", "explicitLabel"), @@ -132,6 +138,10 @@ source = "alpakaGlobalProducerCopyToDeviceCache", expectXvalues = cms.vdouble([3]*10) ) +process.alpakaGlobalConsumerMoveToDeviceCache = process.alpakaGlobalConsumer.clone( + source = 
"alpakaGlobalProducerMoveToDeviceCache", + expectXvalues = cms.vdouble([32]*10) +) process.alpakaStreamConsumer = cms.EDAnalyzer("TestAlpakaAnalyzer", source = cms.InputTag("alpakaStreamProducer"), expectSize = cms.int32(5), @@ -163,7 +173,8 @@ if args.moduleBackend != "": for name in ["ESProducerA", "ESProducerB", "ESProducerC", "ESProducerD", "ESProducerE", "ESProducerAMulti", "ESProducerNull", - "GlobalProducer", "GlobalProducerE", "GlobalProducerCopyToDeviceCache", + "GlobalProducer", "GlobalProducerE", + "GlobalProducerCopyToDeviceCache", "GlobalProducerMoveToDeviceCache", "StreamProducer", "StreamInstanceProducer", "StreamSynchronizingProducer", "StreamSynchronizingProducerToDevice", "GlobalDeviceConsumer", "StreamDeviceConsumer", @@ -180,6 +191,8 @@ def setExpect(m, size): process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(20-10)) setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size=20) process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*20 + setExpect(process.alpakaGlobalConsumerMoveToDeviceCache, size=20) + process.alpakaGlobalConsumerMoveToDeviceCache.expectXvalues = [32]*20 setExpect(process.alpakaStreamConsumer, size=25) setExpect(process.alpakaStreamInstanceConsumer, size=36) setExpect(process.alpakaStreamSynchronizingConsumer, size=20) @@ -192,6 +205,8 @@ def setExpect(m, size): process.alpakaGlobalConsumerE.expectXvalues.extend([0]*(30-10)) setExpect(process.alpakaGlobalConsumerCopyToDeviceCache, size = 30) process.alpakaGlobalConsumerCopyToDeviceCache.expectXvalues = [3]*30 + setExpect(process.alpakaGlobalConsumerMoveToDeviceCache, size = 30) + process.alpakaGlobalConsumerMoveToDeviceCache.expectXvalues = [32]*30 setExpect(process.alpakaStreamConsumer, size = 125) setExpect(process.alpakaStreamInstanceConsumer, size = 216) setExpect(process.alpakaStreamSynchronizingConsumer, size = 30) @@ -211,6 +226,7 @@ def setExpect(m, size): process.alpakaGlobalProducer, process.alpakaGlobalProducerE, 
process.alpakaGlobalProducerCopyToDeviceCache, + process.alpakaGlobalProducerMoveToDeviceCache, process.alpakaStreamProducer, process.alpakaStreamInstanceProducer, process.alpakaStreamSynchronizingProducer, @@ -221,6 +237,7 @@ def setExpect(m, size): process.alpakaGlobalDeviceConsumer+ process.alpakaGlobalConsumerE+ process.alpakaGlobalConsumerCopyToDeviceCache+ + process.alpakaGlobalConsumerMoveToDeviceCache+ process.alpakaStreamConsumer+ process.alpakaStreamDeviceConsumer+ process.alpakaStreamInstanceConsumer+ From 61a5c3a11ca1e088f0ebb685ff59f88326d1ff1f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Thu, 8 Feb 2024 23:13:49 +0100 Subject: [PATCH 5/9] Allow the contained object of PortableHostObject to be initialized in the constructor --- .../Portable/interface/PortableHostObject.h | 22 +++++++++++ .../test/test_catch2_portableObjectOnHost.cc | 38 +++++++++++++++++-- 2 files changed, 56 insertions(+), 4 deletions(-) diff --git a/DataFormats/Portable/interface/PortableHostObject.h b/DataFormats/Portable/interface/PortableHostObject.h index b2f84b38be2dc..a2051a6ff2ab9 100644 --- a/DataFormats/Portable/interface/PortableHostObject.h +++ b/DataFormats/Portable/interface/PortableHostObject.h @@ -20,16 +20,30 @@ class PortableHostObject { using Buffer = cms::alpakatools::host_buffer<Product>; using ConstBuffer = cms::alpakatools::const_host_buffer<Product>; + static_assert(std::is_trivially_destructible_v<Product>); + PortableHostObject() = delete; PortableHostObject(edm::Uninitialized) noexcept {} + // Note that in contrast to the variadic template overload, this + // constructor does not initialize the contained object PortableHostObject(alpaka_common::DevHost const& host) // allocate pageable host memory : buffer_{cms::alpakatools::make_host_buffer<Product>()}, product_{buffer_->data()} { assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0); } + template <typename... 
Args> + PortableHostObject(alpaka_common::DevHost const& host, Args&&... args) + // allocate pageable host memory + : buffer_{cms::alpakatools::make_host_buffer<Product>()}, + product_{new(buffer_->data()) Product(std::forward<Args>(args)...)} { + assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0); + } + + // Note that in contrast to the variadic template overload, this + // constructor does not initialize the contained object template <typename TQueue, typename = std::enable_if_t<alpaka::isQueue<TQueue>>> PortableHostObject(TQueue const& queue) // allocate pinned host memory associated to the given work queue, accessible by the queue's device @@ -37,6 +51,14 @@ class PortableHostObject { assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0); } + template <typename TQueue, typename... Args, typename = std::enable_if_t<alpaka::isQueue<TQueue>>> + PortableHostObject(TQueue const& queue, Args&&... args) + // allocate pinned host memory associated to the given work queue, accessible by the queue's device + : buffer_{cms::alpakatools::make_host_buffer<Product>(queue)}, + product_{new(buffer_->data()) Product(std::forward<Args>(args)...)} { + assert(reinterpret_cast<uintptr_t>(product_) % alignof(Product) == 0); + } + // non-copyable PortableHostObject(PortableHostObject const&) = delete; PortableHostObject& operator=(PortableHostObject const&) = delete; diff --git a/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc b/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc index 698605b57f465..4afe56be3e322 100644 --- a/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc +++ b/DataFormats/Portable/test/test_catch2_portableObjectOnHost.cc @@ -14,10 +14,40 @@ namespace { // This test is currently mostly about the code compiling TEST_CASE("Use of PortableObject<T> on host code", s_tag) { - PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host()); - obj->a = 42; + 
static_assert(std::is_same_v<PortableObject<Test, alpaka::DevCpu>, PortableHostObject<Test>>); - SECTION("Tests") { REQUIRE(obj->a == 42); } + SECTION("Initialize by setting members") { + SECTION("With device") { + PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host()); + obj->a = 42; - static_assert(std::is_same_v<PortableObject<Test, alpaka::DevCpu>, PortableHostObject<Test>>); + REQUIRE(obj->a == 42); + } + + SECTION("With queue") { + alpaka::QueueCpuBlocking queue(cms::alpakatools::host()); + + PortableObject<Test, alpaka::DevCpu> obj(queue); + obj->a = 42; + + REQUIRE(obj->a == 42); + } + } + + SECTION("Initialize via constructor") { + SECTION("With device") { + PortableObject<Test, alpaka::DevCpu> obj(cms::alpakatools::host(), Test{42, 3.14f}); + + REQUIRE(obj->a == 42); + REQUIRE(obj->b == 3.14f); + } + + SECTION("With queue") { + alpaka::QueueCpuBlocking queue(cms::alpakatools::host()); + PortableObject<Test, alpaka::DevCpu> obj(queue, Test{42, 3.14f}); + + REQUIRE(obj->a == 42); + REQUIRE(obj->b == 3.14f); + } + } } From 3d6f3204b8d455cce075a174845700f46ee2b29a Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Thu, 8 Feb 2024 23:20:42 +0100 Subject: [PATCH 6/9] Replace lambda with a direct initialization of PortableHostObject --- .../TestAlpakaGlobalProducerMoveToDeviceCache.cc | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc index 4ca7888002872..51d756e5dbc8f 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc @@ -27,13 +27,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { putTokenMulti2_{produces()}, putTokenMulti3_{produces()}, // create host-side object that gets 
implicitly copied to all devices of the backend - deviceCache_{[&config]() { - PortableHostObject<TestAlgo::UpdateInfo> obj(cms::alpakatools::host()); - *obj = TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"), - config.getParameter<int32_t>("y"), - config.getParameter<int32_t>("z")}; - return obj; - }()} {} + deviceCache_{ + PortableHostObject<TestAlgo::UpdateInfo>{cms::alpakatools::host(), + TestAlgo::UpdateInfo{config.getParameter<int32_t>("x"), + config.getParameter<int32_t>("y"), + config.getParameter<int32_t>("z")}}} {} void produce(edm::StreamID, device::Event& iEvent, device::EventSetup const& iSetup) const override { auto const& input = iEvent.get(getToken_); From ad1db1651f5e6cd7f03c096724795afe92e6c932 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Fri, 9 Feb 2024 08:18:03 -0600 Subject: [PATCH 7/9] Document CopyToDeviceCache and MoveToDeviceCache in README --- HeterogeneousCore/AlpakaCore/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/HeterogeneousCore/AlpakaCore/README.md b/HeterogeneousCore/AlpakaCore/README.md index 43ed5d0778844..586db296972c9 100644 --- a/HeterogeneousCore/AlpakaCore/README.md +++ b/HeterogeneousCore/AlpakaCore/README.md @@ -185,6 +185,26 @@ In the [`fillDescriptions()`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/SWG Also note that the `fillDescription()` function must have the same content for all backends, i.e. any backend-specific behavior with e.g. `#ifdef` or `if constexpr` are forbidden. +### Copy e.g. 
configuration data to all devices in EDProducer + +While the EventSetup can be used to handle copying data to all devices +of an Alpaka backend, for data used only by one EDProducer a simpler +way would be to use one of +* `cms::alpakatools::MoveToDeviceCache<TQueue, THostObject>` (recommended) + * `#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h"` + * Moves the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. On host backends the argument `THostObject` is moved around, but not copied. + * The `THostObject` must not be copyable + * This is to avoid easy mistakes with objects that follow copy semantics of `std::shared_ptr` (that includes Alpaka buffers), that would allow the source memory buffer to be used via another copy during the asynchronous data copy to the device. + * The constructor argument `THostObject` object may not be used, unless it is initialized again e.g. by assigning another `THostObject` into it. + * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns. +* `cms::alpakatools::CopyToDeviceCache<TQueue, THostObject>` (use only if **must** use copyable `THostObject`) + * `#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"` + * Copies the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. Also host backends do a copy. + * The constructor argument `THostObject` object can be used for other purposes immediately after the constructor returns + * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns. 
+ +For examples see [`HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc`](../../HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc) and [`HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc`](../../HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc). + ## Guarantees * All Event data products in the device memory space are guaranteed to be accessible only for operations enqueued in the `Queue` given by `device::Event::queue()` when accessed through the `device::Event`. From 9814921854dc5018a3558f151c865f1f5639f6d3 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Tue, 17 Dec 2024 22:33:38 +0100 Subject: [PATCH 8/9] Template {Copy,Move}ToDevice over Device instead of Queue --- HeterogeneousCore/AlpakaCore/README.md | 4 +- .../AlpakaCore/interface/CopyToDeviceCache.h | 37 ++++++++----------- .../AlpakaCore/interface/MoveToDeviceCache.h | 35 +++++++----------- ...stAlpakaGlobalProducerCopyToDeviceCache.cc | 2 +- ...stAlpakaGlobalProducerMoveToDeviceCache.cc | 2 +- 5 files changed, 33 insertions(+), 47 deletions(-) diff --git a/HeterogeneousCore/AlpakaCore/README.md b/HeterogeneousCore/AlpakaCore/README.md index 586db296972c9..844b14a8be92b 100644 --- a/HeterogeneousCore/AlpakaCore/README.md +++ b/HeterogeneousCore/AlpakaCore/README.md @@ -190,14 +190,14 @@ Also note that the `fillDescription()` function must have the same content for a While the EventSetup can be used to handle copying data to all devices of an Alpaka backend, for data used only by one EDProducer a simpler way would be to use one of -* `cms::alpakatools::MoveToDeviceCache<TQueue, THostObject>` (recommended) +* `cms::alpakatools::MoveToDeviceCache<TDevice, THostObject>` (recommended) * `#include "HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h"` * Moves the `THostObject` to all devices using 
`cms::alpakatools::CopyToDevice<THostObject>` synchronously. On host backends the argument `THostObject` is moved around, but not copied. * The `THostObject` must not be copyable * This is to avoid easy mistakes with objects that follow copy semantics of `std::shared_ptr` (that includes Alpaka buffers), that would allow the source memory buffer to be used via another copy during the asynchronous data copy to the device. * The constructor argument `THostObject` object may not be used, unless it is initialized again e.g. by assigning another `THostObject` into it. * The corresponding device-side object can be obtained with `get()` member function using either alpaka Device or Queue object. It can be used immediately after the constructor returns. -* `cms::alpakatools::CopyToDeviceCache<TQueue, THostObject>` (use only if **must** use copyable `THostObject`) +* `cms::alpakatools::CopyToDeviceCache<TDevice, THostObject>` (use only if **must** use copyable `THostObject`) * `#include "HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h"` * Copies the `THostObject` to all devices using `cms::alpakatools::CopyToDevice<THostObject>` synchronously. Also host backends do a copy. * The constructor argument `THostObject` object can be used for other purposes immediately after the constructor returns diff --git a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h index f33c8c170c90c..3e3a04f0c1834 100644 --- a/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h +++ b/HeterogeneousCore/AlpakaCore/interface/CopyToDeviceCache.h @@ -15,11 +15,11 @@ namespace cms::alpakatools { // std::conditional_t and if constexpr) because the // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. 
for // PortableCollection on host device - template <typename TDev, typename TQueue, typename THostObject> + template <typename TDevice, typename THostObject> class CopyToDeviceCacheImpl { public: - using Device = TDev; - using Queue = TQueue; + using Device = TDevice; + using Queue = alpaka::Queue<Device, alpaka::NonBlocking>; using HostObject = THostObject; using Copy = CopyToDevice<HostObject>; using DeviceObject = decltype(Copy::copyAsync(std::declval<Queue&>(), std::declval<HostObject const&>())); @@ -47,8 +47,8 @@ namespace cms::alpakatools { }; // For host device, copy the host object directly instead - template <typename TQueue, typename THostObject> - class CopyToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> { + template <typename THostObject> + class CopyToDeviceCacheImpl<alpaka_common::DevHost, THostObject> { public: using HostObject = THostObject; using DeviceObject = HostObject; @@ -63,9 +63,9 @@ namespace cms::alpakatools { } // namespace detail /** - * This class template implements a cache for data that is copied + * This class template implements a cache for data that is moved * from the host (of type THostObject) to all the devices - * corresponding the TQueue queue type. + * corresponding to the TDevice device type. * * The host-side object to be copied is given as an argument to the * class constructor. The constructor uses the @@ -77,31 +77,24 @@ namespace cms::alpakatools { * type is the return type of CopyToDevice<THostObject>::copyAsync()) * can be obtained with get() member function, that has either the * queue or device argument. - * - * TODO: In principle it would be better to template over Device, - * but then we'd need a way to have a "default queue" type for each - * Device in order to infer the return type of - * CopyToDevice::copyAsync(). Alternatively, the template over - * TQueue could be removed by moving the class definition to - * ALPAKA_ACCELERATOR_NAMESPACE. 
*/ - template <typename TQueue, typename THostObject> + template <typename TDevice, typename THostObject> + requires alpaka::isDevice<TDevice> class CopyToDeviceCache { - using Queue = TQueue; - using Device = alpaka::Dev<Queue>; + using Device = TDevice; using HostObject = THostObject; - using Impl = detail::CopyToDeviceCacheImpl<Device, Queue, HostObject>; + using Impl = detail::CopyToDeviceCacheImpl<Device, HostObject>; using DeviceObject = typename Impl::DeviceObject; public: CopyToDeviceCache(THostObject const& srcData) : data_(srcData) {} - // TODO: I could make this function to return the contained object - // in case of alpaka buffer, PortableObject, or PortableCollection - // (in PortableCollection case it would be the View) DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); } - DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); } + template <typename TQueue> + DeviceObject const& get(TQueue const& queue) const { + return get(alpaka::getDev(queue)); + } private: Impl data_; diff --git a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h index 2f5bcbb765bac..2c66fd384798a 100644 --- a/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h +++ b/HeterogeneousCore/AlpakaCore/interface/MoveToDeviceCache.h @@ -17,11 +17,11 @@ namespace cms::alpakatools { // std::conditional_t and if constexpr) because the // CopyToDevice<THostObject>::copyAsync() is ill-defined e.g. 
for // PortableCollection on host device - template <typename TDev, typename TQueue, typename THostObject> + template <typename TDevice, typename THostObject> class MoveToDeviceCacheImpl { public: using HostObject = THostObject; - using Impl = CopyToDeviceCacheImpl<TDev, TQueue, THostObject>; + using Impl = CopyToDeviceCacheImpl<TDevice, THostObject>; using DeviceObject = typename Impl::DeviceObject; MoveToDeviceCacheImpl(HostObject&& srcObject) : impl_(srcObject) {} @@ -33,8 +33,8 @@ namespace cms::alpakatools { }; // For host device, move the host object instead - template <typename TQueue, typename THostObject> - class MoveToDeviceCacheImpl<alpaka_common::DevHost, TQueue, THostObject> { + template <typename THostObject> + class MoveToDeviceCacheImpl<alpaka_common::DevHost, THostObject> { public: using HostObject = THostObject; using DeviceObject = HostObject; @@ -51,14 +51,14 @@ namespace cms::alpakatools { /** * This class template implements a cache for data that is moved * from the host (of type THostObject) to all the devices - * corresponding the TQueue queue type. + * corresponding to the TDevice device type. * * The host-side object to be moved is given as an argument to the * class constructor. The constructor uses the * CopyToDevice<THostObject> class template to copy the data to the * devices, and waits for the data copies to finish, i.e. the * constructor is synchronous wrt. the data copies. The "move" is - * achieved by requiring the constructor argument to the rvalue + * achieved by requiring the constructor argument to be an rvalue * reference. * * Note that the host object type is required to be non-copyable. @@ -71,21 +71,14 @@ namespace cms::alpakatools { * type is the return type of CopyToDevice<THostObject>::copyAsync()) * can be obtained with get() member function, that has either the * queue or device argument. 
- * - * TODO: In principle it would be better to template over Device, - * but then we'd need a way to have a "default queue" type for each - * Device in order to infer the return type of - * CopyToDevice::copyAsync(). Alternatively, the template over - * TQueue could be removed by moving the class definition to - * ALPAKA_ACCELERATOR_NAMESPACE. */ - template <typename TQueue, typename THostObject> + template <typename TDevice, typename THostObject> + requires alpaka::isDevice<TDevice> class MoveToDeviceCache { public: - using Queue = TQueue; - using Device = alpaka::Dev<Queue>; + using Device = TDevice; using HostObject = THostObject; - using Impl = detail::MoveToDeviceCacheImpl<Device, Queue, HostObject>; + using Impl = detail::MoveToDeviceCacheImpl<Device, HostObject>; using DeviceObject = typename Impl::DeviceObject; static_assert(not(std::is_copy_constructible_v<HostObject> or std::is_copy_assignable_v<HostObject>), @@ -93,12 +86,12 @@ namespace cms::alpakatools { MoveToDeviceCache(HostObject&& srcData) : data_(std::move(srcData)) {} - // TODO: I could make this function to return the contained object - // in case of alpaka buffer, PortableObject, or PortableCollection - // (in PortableCollection case it would be the View) DeviceObject const& get(Device const& dev) const { return data_.get(alpaka::getNativeHandle(dev)); } - DeviceObject const& get(Queue const& queue) const { return get(alpaka::getDev(queue)); } + template <typename TQueue> + DeviceObject const& get(TQueue const& queue) const { + return get(alpaka::getDev(queue)); + } private: Impl data_; diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc index 331b85d093bda..8d8ea5b7ac181 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc +++ 
b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerCopyToDeviceCache.cc @@ -74,7 +74,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // implementation of the algorithm TestAlgo algo_; - cms::alpakatools::CopyToDeviceCache<Queue, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_; + cms::alpakatools::CopyToDeviceCache<Device, cms::alpakatools::host_buffer<TestAlgo::UpdateInfo>> deviceCache_; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc index 51d756e5dbc8f..6c8fe100f0ec5 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlpakaGlobalProducerMoveToDeviceCache.cc @@ -73,7 +73,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { // implementation of the algorithm TestAlgo algo_; - cms::alpakatools::MoveToDeviceCache<Queue, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_; + cms::alpakatools::MoveToDeviceCache<Device, PortableHostObject<TestAlgo::UpdateInfo>> deviceCache_; }; } // namespace ALPAKA_ACCELERATOR_NAMESPACE From 25fe7a6f4aac8d8c368c04631f04bd35eace7032 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen <matti.kortelainen@cern.ch> Date: Tue, 17 Dec 2024 15:42:38 -0600 Subject: [PATCH 9/9] Use Acc1D directly in TestAlgo Co-authored-by: Andrea Bocci <fwyzard@gmail.com> --- .../AlpakaTest/plugins/alpaka/TestAlgo.dev.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc index a9034ead09e0f..53d7318c907df 100644 --- a/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc +++ b/HeterogeneousCore/AlpakaTest/plugins/alpaka/TestAlgo.dev.cc @@ -177,8 +177,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE 
{ } } - template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestDeviceCollection::ConstView input, TestAlgo::UpdateInfo const* updateInfo, portabletest::TestDeviceCollection::View output) const { @@ -228,8 +227,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } } - template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestSoA::ConstView input, portabletest::TestSoA2::ConstView input2, TestAlgo::UpdateInfo const* updateInfo, @@ -299,8 +297,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } } - template <typename TAcc, typename = std::enable_if_t<alpaka::isAccelerator<TAcc>>> - ALPAKA_FN_ACC void operator()(TAcc const& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, portabletest::TestSoA::ConstView input, portabletest::TestSoA2::ConstView input2, portabletest::TestSoA3::ConstView input3,