From 3dfb1a9f97c149aea43cd6206626ad188a0b1e06 Mon Sep 17 00:00:00 2001
From: Andrea Bocci <andrea.bocci@cern.ch>
Date: Thu, 21 Dec 2023 00:58:05 +0100
Subject: [PATCH] Add a test for independent_groups and
 independent_group_elements

---
 .../AlpakaInterface/test/BuildFile.xml        |   7 +
 .../test/alpaka/testIndependentKernel.dev.cc  | 144 ++++++++++++++++++
 2 files changed, 151 insertions(+)
 create mode 100644 HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc
diff --git a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
index 5f9c5fe81981f..2d204819d740b 100644
--- a/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
+++ b/HeterogeneousCore/AlpakaInterface/test/BuildFile.xml
@@ -12,6 +12,13 @@
   <flags ALPAKA_BACKENDS="1"/>
 </bin>
 
+<bin name="alpakaTestIndependentKernel" file="alpaka/testIndependentKernel.dev.cc">
+  <use name="alpaka"/>
+  <use name="catch2"/>
+  <use name="HeterogeneousCore/AlpakaInterface"/>
+  <flags ALPAKA_BACKENDS="1"/>
+</bin>
+
 <bin name="alpakaTestBackend" file="testBackend.cc">
   <use name="catch2"/>
   <use name="HeterogeneousCore/AlpakaInterface"/>
diff --git a/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc
new file mode 100644
index 0000000000000..bd98efcfa32d6
--- /dev/null
+++ b/HeterogeneousCore/AlpakaInterface/test/alpaka/testIndependentKernel.dev.cc
@@ -0,0 +1,144 @@
+#include <cstdio>
+#include <random>
+
+#include <alpaka/alpaka.hpp>
+
+#define CATCH_CONFIG_MAIN
+#include <catch.hpp>
+
+#include "HeterogeneousCore/AlpakaInterface/interface/config.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/memory.h"
+#include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h"
+
+// each test binary is built for a single Alpaka backend
+using namespace ALPAKA_ACCELERATOR_NAMESPACE;
+
+/* Add the group id to the value of each element in the group.
+ * Each group is composed of the elements indices[group]..indices[group+1]-1 .
+ */
+struct IndependentWorkKernel {
+  template <typename TAcc, typename T>
+  ALPAKA_FN_ACC void operator()(TAcc const& acc,
+                                T const* __restrict__ in,
+                                T* __restrict__ out,
+                                size_t const* __restrict__ indices,
+                                size_t groups) const {
+    for (auto group : cms::alpakatools::independent_groups(acc, groups)) {
+      size_t const begin = indices[group];              // position of the first element of this group
+      size_t const count = indices[group + 1] - begin;  // number of elements in this group
+      for (auto offset : cms::alpakatools::independent_group_elements(acc, count)) {
+        size_t const element = begin + offset;  // position of the element inside in[] and out[]
+        out[element] = in[element] + group;
+      }
+    }
+  }
+};
+
+/* Test the IndependentWorkKernel kernel on all devices
+ */
+template <typename TKernel>
+void testIndependentWorkKernel(size_t groups, size_t grid_size, size_t block_size, TKernel kernel) {
+  // random number engine, seeded from the system's random device
+  std::random_device rd{};
+  std::default_random_engine engine{rd()};
+
+  // uniform distribution for the size of each group, between 100 and 201 elements (inclusive)
+  std::uniform_int_distribution<size_t> random_size{100, 201};
+
+  // gaussian distribution for the input values
+  std::normal_distribution<float> dist{0., 1.};
+
+  // build the groups: group g is made of the elements indices_h[g] .. indices_h[g + 1] - 1,
+  // so indices_h holds the running sum of the group sizes and indices_h[groups] is the total
+  auto indices_h = cms::alpakatools::make_host_buffer<size_t[], Platform>(groups + 1);
+  indices_h[0] = 0;
+  for (size_t i = 0; i < groups; ++i) {
+    size_t const group_size = random_size(engine);
+    indices_h[i + 1] = indices_h[i] + group_size;
+  }
+
+  // tolerance
+  constexpr float epsilon = 0.000001;
+
+  // buffer size: the total number of elements over all groups
+  const size_t size = indices_h[groups];
+
+  // allocate the input and output host buffer in pinned memory accessible by the Platform devices
+  auto in_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+  auto out_h = cms::alpakatools::make_host_buffer<float[], Platform>(size);
+
+  // fill the input buffers with random data, and the output buffer with zeros
+  for (size_t i = 0; i < size; ++i) {
+    in_h[i] = dist(engine);
+    out_h[i] = 0;
+  }
+
+  // run the test on each device
+  for (auto const& device : cms::alpakatools::devices<Platform>()) {
+    std::cout << "Test IndependentWorkKernel on " << alpaka::getName(device) << " over " << size << " elements in "
+              << groups << " independent groups with " << grid_size << " blocks of " << block_size << " elements\n";
+    auto queue = Queue(device);
+
+    // allocate input and output buffers on the device
+    auto indices_d = cms::alpakatools::make_device_buffer<size_t[]>(queue, groups + 1);
+    auto in_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+    auto out_d = cms::alpakatools::make_device_buffer<float[]>(queue, size);
+
+    // copy the input data to the device; the size is known from the buffer objects
+    alpaka::memcpy(queue, indices_d, indices_h);
+    alpaka::memcpy(queue, in_d, in_h);
+
+    // fill the output buffer with zeros; the size is known from the buffer objects
+    alpaka::memset(queue, out_d, 0.);
+
+    // launch the 1-dimensional kernel with independent work groups
+    auto div = cms::alpakatools::make_workdiv<Acc1D>(grid_size, block_size);
+    alpaka::exec<Acc1D>(queue, div, kernel, in_d.data(), out_d.data(), indices_d.data(), groups);
+
+    // copy the results from the device to the host
+    alpaka::memcpy(queue, out_h, out_d);
+
+    // wait for all the operations to complete
+    alpaka::wait(queue);
+
+    // check the results: every element of group g should be its input value plus g,
+    // within a relative tolerance of epsilon (absolute when the expected value is below 1)
+    for (size_t g = 0; g < groups; ++g) {
+      size_t first = indices_h[g];
+      size_t last = indices_h[g + 1];
+      for (size_t i = first; i < last; ++i) {
+        float sum = in_h[i] + g;
+        float delta = std::max(std::fabs(sum) * epsilon, epsilon);
+        REQUIRE(out_h[i] < sum + delta);
+        REQUIRE(out_h[i] > sum - delta);
+      }
+    }
+  }
+}
+
+TEST_CASE("Test alpaka kernels for the " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) " backend",
+          "[" EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE) "]") {
+  SECTION("Independent work groups") {
+    // get the list of devices on the current platform; the test fails if there are none
+    auto const& devices = cms::alpakatools::devices<Platform>();
+    if (devices.empty()) {
+      INFO("No devices available on the platform " EDM_STRINGIZE(ALPAKA_ACCELERATOR_NAMESPACE));
+      REQUIRE(not devices.empty());
+    }
+
+    // launch the independent work kernel with a small block size and a small number of blocks;
+    // this relies on the kernel to loop over the "problem space" and do more work per block
+    std::cout << "Test independent work kernel with small block size, using scalar dimensions\n";
+    testIndependentWorkKernel(100, 32, 32, IndependentWorkKernel{});
+
+    // launch the independent work kernel with a large block size and a single block;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test independent work kernel with large block size, using scalar dimensions\n";
+    testIndependentWorkKernel(100, 1, 1024, IndependentWorkKernel{});
+
+    // launch the independent work kernel with a large block size and a large number of blocks;
+    // this relies on the kernel to check the size of the "problem space" and avoid accessing out-of-bounds data
+    std::cout << "Test independent work kernel with large block size and large number of blocks, using scalar dimensions\n";
+    testIndependentWorkKernel(100, 1024, 1024, IndependentWorkKernel{});
+  }
+}