From 173420d3c3d4ecc82e0fa4058297230824e15af1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Tue, 12 Mar 2024 12:09:32 +0200
Subject: [PATCH 1/6] Enable LTO / IPO on the runtime library

Enable link-time / interprocedural optimizations on the runtime
library if available on the system.
---
 CMakeLists.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 289557420..3b98bb31b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -320,6 +320,18 @@ else()
   set(CHIP_LIB_NAME "libCHIP.a")
 endif()
 
+include(CheckIPOSupported)
+check_ipo_supported(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
+message(STATUS "Interprocedural optimizations (IPO): ${SUPPORTS_IPO}.")
+if(SUPPORTS_IPO)
+  set_property(TARGET CHIP PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
+else()
+  message(NOTICE "To enable IPO with clang you may need to re-configure \
+LLVM/Clang with -DLLVM_BINUTILS_INCDIR for locating plugin-api.h from \
+binutils-dev package.")
+  message(STATUS "IPO error: ${IPO_ERROR}")
+endif()
+
 set(CHIP_INTERFACE_LIBS ${PTHREAD_LIBRARY})
 
 if(OpenCL_LIBRARY)

From 29599401bcc7dcc6baf5291ce67f4b99113c7aaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Tue, 12 Mar 2024 14:21:43 +0200
Subject: [PATCH 2/6] Add early exit from registeredVarCopy()

Exit early if there are no hipMemoryTypeManaged and hipMemoryTypeHost
allocations in the current context.
---
 src/CHIPBackend.cc |  9 +++++++--
 src/CHIPBackend.hh | 22 ++++++++++++++++++++++
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc
index 591f6c5ca..5be039de3 100644
--- a/src/CHIPBackend.cc
+++ b/src/CHIPBackend.cc
@@ -167,6 +167,7 @@ void chipstar::AllocationTracker::recordAllocation(
     // Map onto host so that the data can be potentially initialized on host
     ::Backend->getActiveDevice()->getDefaultQueue()->MemMap(
         AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_WRITE);
+    NumHostAllocations_ += 1;
   }
 
   if (MemoryType == hipMemoryTypeUnified)
@@ -1758,9 +1759,13 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem,
   //       the kernel does not have any, we only need inspect kernels
   //       pointer arguments for allocations to be synchronized.
 
-  std::vector<std::shared_ptr<chipstar::Event>> CopyEvents;
+  auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker;
+  if (!AllocTracker->getNumHostAllocations() &&
+      !AllocTracker->getNumManagedAllocations())
+    return nullptr; // Nothing to synchronize.
+
   auto PreKernel = ExecState == MANAGED_MEM_STATE::PRE_KERNEL;
-  auto &AllocTracker = ::Backend->getActiveDevice()->AllocTracker;
+  std::vector<std::shared_ptr<chipstar::Event>> CopyEvents;
   auto ArgVisitor = [&](const chipstar::AllocationInfo &AllocInfo) -> void {
     if (AllocInfo.MemoryType == hipMemoryTypeHost) {
       logDebug("Sync host memory {} ({})", AllocInfo.HostPtr,
diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh
index ce92533a5..8ec383700 100644
--- a/src/CHIPBackend.hh
+++ b/src/CHIPBackend.hh
@@ -500,6 +500,9 @@ private:
   std::unordered_set<chipstar::AllocationInfo *> AllocInfos_;
   std::unordered_map<void *, chipstar::AllocationInfo *> PtrToAllocInfo_;
 
+  size_t NumHostAllocations_ = 0;
+  size_t NumManagedAllocations_ = 0;
+
 public:
   mutable std::mutex AllocationTrackerMtx;
 
@@ -522,6 +525,7 @@ public:
     this->PtrToAllocInfo_[HostPtr] = AllocInfo;
     AllocInfo->MemoryType = hipMemoryTypeManaged;
     AllocInfo->IsHostRegistered = true;
+    NumManagedAllocations_ += 1;
   }
 
   size_t GlobalMemSize, TotalMemSize, MaxMemUsed;
@@ -604,6 +608,20 @@ public:
     assert(AllocInfo && "Null pointer passed to eraseRecord");
     assert(AllocInfos_.count(AllocInfo) &&
            "Not a member of the allocation tracker!");
+
+    switch (AllocInfo->MemoryType) {
+    default:
+      break;
+    case hipMemoryTypeHost:
+      assert(NumHostAllocations_ > 0);
+      NumHostAllocations_ -= 1;
+      break;
+    case hipMemoryTypeManaged:
+      assert(NumManagedAllocations_ > 0);
+      NumManagedAllocations_ -= 1;
+      break;
+    }
+
     PtrToAllocInfo_.erase(AllocInfo->DevPtr);
     if (AllocInfo->HostPtr)
       PtrToAllocInfo_.erase(AllocInfo->HostPtr);
@@ -623,6 +641,10 @@ public:
   }
 
   size_t getNumAllocations() const { return AllocInfos_.size(); }
+
+  // Return the number of host type allocations.
+  size_t getNumHostAllocations() const { return NumHostAllocations_; }
+  size_t getNumManagedAllocations() const { return NumManagedAllocations_; }
 };
 
 class DeviceVar {

From 9f578f63dcf2493bcd807fffae300bbab93bad64 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Thu, 14 Mar 2024 15:23:09 +0200
Subject: [PATCH 3/6] Early exit from getSyncQueuesLastEvents()

---
 src/CHIPBackend.cc                      | 38 ++++++++++++++++---------
 src/CHIPBackend.hh                      | 14 ++++++---
 src/backend/Level0/CHIPBackendLevel0.cc |  3 +-
 src/backend/OpenCL/CHIPBackendOpenCL.cc | 16 +++++------
 4 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc
index 5be039de3..e786bce90 100644
--- a/src/CHIPBackend.cc
+++ b/src/CHIPBackend.cc
@@ -532,10 +532,9 @@ chipstar::Device::~Device() {
   if (PerThreadDefaultQueue)
     PerThreadDefaultQueue->finish();
 
-  while (this->ChipQueues_.size() > 0) {
-    delete ChipQueues_[0];
-    ChipQueues_.erase(ChipQueues_.begin());
-  }
+  for (auto *Queue : UserQueues_)
+    delete Queue;
+  UserQueues_.clear();
 
   delete LegacyDefaultQueue;
   LegacyDefaultQueue = nullptr;
@@ -849,9 +848,9 @@ void chipstar::Device::addQueue(chipstar::Queue *ChipQueue) {
   logDebug("{} Device::addQueue({})", (void *)this, (void *)ChipQueue);
 
   auto QueueFound =
-      std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue);
-  if (QueueFound == ChipQueues_.end()) {
-    ChipQueues_.push_back(ChipQueue);
+      std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue);
+  if (QueueFound == UserQueues_.end()) {
+    UserQueues_.push_back(ChipQueue);
   } else {
     CHIPERR_LOG_AND_THROW("Tried to add a queue to the backend which was "
                           "already present in the backend queue list",
@@ -919,19 +918,19 @@ bool chipstar::Device::removeQueue(chipstar::Queue *ChipQueue) {
    *
    * Choosing not to call Queue->finish()
    */
-  LOCK(DeviceMtx) // reading chipstar::Device::ChipQueues_
+  LOCK(DeviceMtx) // reading chipstar::Device::UserQueues_
   ChipQueue->updateLastEvent(nullptr);
 
   // Remove from device queue list
   auto FoundQueue =
-      std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue);
-  if (FoundQueue == ChipQueues_.end()) {
+      std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue);
+  if (FoundQueue == UserQueues_.end()) {
     std::string Msg =
         "Tried to remove a queue for a device but the queue was not found in "
         "device queue list";
     CHIPERR_LOG_AND_THROW(Msg, hipErrorUnknown);
   }
-  ChipQueues_.erase(FoundQueue);
+  UserQueues_.erase(FoundQueue);
 
   delete ChipQueue;
   return true;
@@ -1492,13 +1491,23 @@ void chipstar::Queue::updateLastEvent(
   LastEvent_ = NewEvent;
 }
 
+/// Return a list of events from other queues that the current queue needs to
+/// synchronize with for modeling the implicit synchronization behavior of the
+/// NULL stream. Called queue's last event is included if 'IncludeSelfLastEvent'
+/// is true.
 std::pair<chipstar::SharedEventVector, chipstar::LockGuardVector>
-chipstar::Queue::getSyncQueuesLastEvents(
-    std::shared_ptr<chipstar::Event> Event) {
+chipstar::Queue::getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> Event,
+                                         bool IncludeSelfLastEvent) {
 
   std::vector<std::shared_ptr<chipstar::Event>> EventsToWaitOn;
   std::vector<std::unique_ptr<std::unique_lock<std::mutex>>> EventLocks;
 
+  // No need for default-stream implicit synchronization if there are
+  // no user created blocking queues.
+  auto NumUserQueues = ChipDevice_->getNumUserQueues();
+  if (!NumUserQueues && !IncludeSelfLastEvent)
+    return {EventsToWaitOn, std::move(EventLocks)};
+
   EventLocks.push_back(std::make_unique<std::unique_lock<std::mutex>>(
       ::Backend->GlobalLastEventMtx));
   EventLocks.push_back(
@@ -1517,6 +1526,9 @@ chipstar::Queue::getSyncQueuesLastEvents(
   EventLocks.push_back(
       std::make_unique<std::unique_lock<std::mutex>>(Event->EventMtx));
 
+  if (!NumUserQueues)
+    return {EventsToWaitOn, std::move(EventLocks)};
+
   // If this stream is default legacy stream, sync with all other streams on
   // this device
   if (this->isDefaultLegacyQueue() || this->isDefaultPerThreadQueue()) {
diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh
index 8ec383700..d0ecd4564 100644
--- a/src/CHIPBackend.hh
+++ b/src/CHIPBackend.hh
@@ -1310,7 +1310,10 @@ class Device {
 protected:
   std::string DeviceName_;
   chipstar::Context *Ctx_;
-  std::vector<chipstar::Queue *> ChipQueues_;
+
+  /// List of user created queues.
+  std::vector<chipstar::Queue *> UserQueues_;
+
   std::once_flag PropsPopulated_;
 
   hipDeviceAttribute_t Attrs_;
@@ -1341,7 +1344,10 @@ public:
   std::mutex DeviceMtx;
   std::mutex QueueAddRemoveMtx;
 
-  std::vector<chipstar::Queue *> getQueuesNoLock() { return ChipQueues_; }
+  std::vector<chipstar::Queue *> getQueuesNoLock() { return UserQueues_; }
+
+  /// Return the number of user created queues.
+  size_t getNumUserQueues() const noexcept { return UserQueues_.size(); }
 
   chipstar::Queue *LegacyDefaultQueue;
   inline static thread_local std::unique_ptr<chipstar::Queue>
@@ -2131,9 +2137,9 @@ public:
     isPerThreadDefaultQueue_ = Status;
   }
 
-  std::pair<SharedEventVector, LockGuardVector> getSyncQueuesLastEvents();
   std::pair<SharedEventVector, LockGuardVector>
-  getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> LastEvent);
+  getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> LastEvent,
+                          bool IncludeSelfLastEvent);
   enum MEM_MAP_TYPE { HOST_READ, HOST_WRITE, HOST_READ_WRITE };
   virtual void MemMap(const chipstar::AllocationInfo *AllocInfo,
                       MEM_MAP_TYPE MapType) {}
diff --git a/src/backend/Level0/CHIPBackendLevel0.cc b/src/backend/Level0/CHIPBackendLevel0.cc
index 448fe4227..786c649bf 100644
--- a/src/backend/Level0/CHIPBackendLevel0.cc
+++ b/src/backend/Level0/CHIPBackendLevel0.cc
@@ -830,7 +830,8 @@ CHIPQueueLevel0::~CHIPQueueLevel0() {
 std::pair<std::vector<ze_event_handle_t>, chipstar::LockGuardVector>
 CHIPQueueLevel0::addDependenciesQueueSync(
     std::shared_ptr<chipstar::Event> TargetEvent) {
-  auto [EventsToWaitOn, EventLocks] = getSyncQueuesLastEvents(TargetEvent);
+  auto [EventsToWaitOn, EventLocks] =
+      getSyncQueuesLastEvents(TargetEvent, true);
   for (auto &Event : EventsToWaitOn)
     Event->isDeletedSanityCheck();
 
diff --git a/src/backend/OpenCL/CHIPBackendOpenCL.cc b/src/backend/OpenCL/CHIPBackendOpenCL.cc
index 7513c166b..8e0445cad 100644
--- a/src/backend/OpenCL/CHIPBackendOpenCL.cc
+++ b/src/backend/OpenCL/CHIPBackendOpenCL.cc
@@ -1084,7 +1084,7 @@ void CHIPQueueOpenCL::MemMap(const chipstar::AllocationInfo *AllocInfo,
   auto MemMapEventNative =
       std::static_pointer_cast<CHIPEventOpenCL>(MemMapEvent)->getNativePtr();
 
-  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent);
+  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent, false);
   std::vector<cl_event> SyncQueuesEventHandles = getOpenCLHandles(EventsToWait);
 
   auto QueueHandle = get()->get();
@@ -1123,7 +1123,7 @@ void CHIPQueueOpenCL::MemUnmap(const chipstar::AllocationInfo *AllocInfo) {
     return;
   }
   logDebug("CHIPQueueOpenCL::MemUnmap");
-  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent);
+  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent, false);
   std::vector<cl_event> SyncQueuesEventHandles = getOpenCLHandles(EventsToWait);
 
   auto Status = clEnqueueSVMUnmap(
@@ -1217,7 +1217,7 @@ std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueMarkerImpl() {
       static_cast<CHIPBackendOpenCL *>(Backend)->createEventShared(
           ChipContext_);
 
-  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MarkerEvent);
+  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MarkerEvent, false);
   std::vector<cl_event> SyncQueuesEventHandles = getOpenCLHandles(EventsToWait);
 
   auto Status = clEnqueueMarkerWithWaitList(
@@ -1263,7 +1263,7 @@ CHIPQueueOpenCL::launchImpl(chipstar::ExecItem *ExecItem) {
   auto AllocationsToKeepAlive = annotateIndirectPointers(
       *OclContext, Kernel->getModule()->getInfo(), KernelHandle);
 
-  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(LaunchEvent);
+  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(LaunchEvent, false);
   std::vector<cl_event> SyncQueuesEventHandles = getOpenCLHandles(EventsToWait);
 
   auto Status = clEnqueueNDRangeKernel(
@@ -1405,7 +1405,7 @@ CHIPQueueOpenCL::memCopyAsyncImpl(void *Dst, const void *Src, size_t Size) {
 #ifdef CHIP_DUBIOUS_LOCKS
     LOCK(Backend->DubiousLockOpenCL)
 #endif
-    auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event);
+      auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false);
     std::vector<cl_event> SyncQueuesEventHandles =
         getOpenCLHandles(EventsToWait);
 
@@ -1434,7 +1434,7 @@ CHIPQueueOpenCL::memFillAsyncImpl(void *Dst, size_t Size, const void *Pattern,
       static_cast<CHIPBackendOpenCL *>(Backend)->createEventShared(
           ChipContext_);
   logTrace("clSVMmemfill {} / {} B\n", Dst, Size);
-  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event);
+  auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false);
   std::vector<cl_event> SyncQueuesEventHandles = getOpenCLHandles(EventsToWait);
 
   int Retval = ::clEnqueueSVMMemFill(
@@ -1527,7 +1527,7 @@ std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueBarrierImpl(
           std::static_pointer_cast<CHIPEventOpenCL>(WaitEvent)->getNativeRef());
     }
     // auto Status = ClQueue_->enqueueBarrierWithWaitList(&Events, &Barrier);
-    auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event);
+    auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false);
     std::vector<cl_event> SyncQueuesEventHandles =
         getOpenCLHandles(EventsToWait);
 
@@ -1541,7 +1541,7 @@ std::shared_ptr<chipstar::Event> CHIPQueueOpenCL::enqueueBarrierImpl(
     CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd);
   } else {
     // auto Status = ClQueue_->enqueueBarrierWithWaitList(nullptr, &Barrier);
-    auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event);
+    auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false);
     std::vector<cl_event> SyncQueuesEventHandles =
         getOpenCLHandles(EventsToWait);
 

From fed0f7cbd9a6a27b5948809e7cf937d42b4a5f79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Thu, 14 Mar 2024 16:46:22 +0200
Subject: [PATCH 4/6] SPIRVFuncInfo: eliminate map lookup

This saves some time.
---
 src/SPIRVFuncInfo.cc              | 24 ++----------------------
 src/SPIRVFuncInfo.hh              | 14 +++++---------
 src/spirv.cc                      | 10 +++++++---
 tests/runtime/TestArgVisitors.cpp | 21 ++++++++++++++++-----
 4 files changed, 30 insertions(+), 39 deletions(-)

diff --git a/src/SPIRVFuncInfo.cc b/src/SPIRVFuncInfo.cc
index 36e7394ed..445e2935a 100644
--- a/src/SPIRVFuncInfo.cc
+++ b/src/SPIRVFuncInfo.cc
@@ -84,16 +84,6 @@ std::string_view SPVFuncInfo::Arg::getKindAsString() const {
   }
 }
 
-bool SPVFuncInfo::isSpilledArg(unsigned KernelArgIndex) const {
-  return SpilledArgs_.count(KernelArgIndex);
-}
-
-unsigned SPVFuncInfo::getSpilledArgSize(unsigned KernelArgIndex) const {
-  assert(isSpilledArg(KernelArgIndex));
-  assert(KernelArgIndex < (1u << 16u) && "Definitely out-of-bounds");
-  return SpilledArgs_.at(static_cast<uint16_t>(KernelArgIndex));
-}
-
 /// Client side kernel argument visitor.
 void SPVFuncInfo::visitClientArgsImpl(const std::vector<void *> &ClientArgList,
                                       ClientArgVisitor Visitor) const {
@@ -105,11 +95,9 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector<void *> &ClientArgList,
     auto ArgKind = ArgTI.Kind;
     auto ArgSize = ArgTI.Size;
 
-    if (isSpilledArg(KernelArgIdx)) {
-      assert(ArgKind != SPVTypeKind::Image && ArgKind != SPVTypeKind::Sampler &&
-             "Impossible arg kind to spill!");
+    if (ArgKind == SPVTypeKind::PODByRef) {
+      // Users' POV the kernel argument is passed by value.
       ArgKind = SPVTypeKind::POD;
-      ArgSize = getSpilledArgSize(KernelArgIdx);
     }
 
     // Additional argument created by the texture lowering pass.
@@ -160,14 +148,6 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector<void *> &ClientArgList,
     auto ArgKind = ArgTI.Kind;
     auto ArgSize = ArgTI.Size;
 
-    if (isSpilledArg(ArgIndex)) {
-      assert(ArgKind != SPVTypeKind::Image && ArgKind != SPVTypeKind::Sampler &&
-             "Impossible arg kind to spill!");
-      ArgKind = SPVTypeKind::PODByRef;
-      ArgSize = getSpilledArgSize(ArgIndex);
-      logDebug("*** Arg {} spill size: {}", ArgIndex, ArgSize); // DEBUG
-    }
-
     // Sampler is additional argument generated by HipTextureLowering
     // pass and it appears after SPVTypeKind::Image argument. Pass
     // the same argument that was passed for the image argument
diff --git a/src/SPIRVFuncInfo.hh b/src/SPIRVFuncInfo.hh
index cca44a736..c62ec2a8f 100644
--- a/src/SPIRVFuncInfo.hh
+++ b/src/SPIRVFuncInfo.hh
@@ -39,9 +39,8 @@ enum class SPVTypeKind : unsigned {
   Pointer, // The type is a pointer of any storage class.
 
   // Kinds that may only appear in SPVFuncInfo::KernelArg.
-  PODByRef, // Same as PODB except the value is passed in an
-            // intermediate device buffer and a pointer to its
-            // location given to the kernel.
+  PODByRef, // Same as POD except the value is passed indirectly via
+            // a device buffer.
   Image,    // The type is a image.
   Sampler,  // The type is a sample.
 
@@ -75,9 +74,8 @@ class SPVFuncInfo {
 
   std::vector<SPVArgTypeInfo> ArgTypeInfo_;
 
-  /// Spilled argument annotations represented as pairs of argument
-  /// index (key) and argument size (value).
-  std::map<uint16_t, uint16_t> SpilledArgs_;
+  /// Set to true if any argument is SPVTypeKind::PODByRef.
+  bool HasByRefArgs_ = false;
 
 public:
   /// A structure for argument info passed by the visitor methods.
@@ -119,15 +117,13 @@ public:
   unsigned getNumKernelArgs() const { return ArgTypeInfo_.size(); }
 
   /// Return true is any argument is passed via intermediate buffer.
-  bool hasByRefArgs() const { return SpilledArgs_.size(); }
+  bool hasByRefArgs() const noexcept { return HasByRefArgs_; }
 
 private:
   void visitClientArgsImpl(const std::vector<void *> &ArgList,
                            ClientArgVisitor Fn) const;
   void visitKernelArgsImpl(const std::vector<void *> &ArgList,
                            KernelArgVisitor Fn) const;
-  bool isSpilledArg(unsigned KernelArgIndex) const;
-  unsigned getSpilledArgSize(unsigned KernelArgIndex) const;
 };
 
 typedef std::map<int32_t, std::shared_ptr<SPVFuncInfo>> SPVFuncInfoMap;
diff --git a/src/spirv.cc b/src/spirv.cc
index f838ccb2f..2b90bad59 100644
--- a/src/spirv.cc
+++ b/src/spirv.cc
@@ -579,9 +579,13 @@ class SPIRVmodule {
       assert(Fi != KernelInfoMap_.end());
       auto FnInfo = Fi->second;
 
-      if (SpilledArgAnnotations_.count(KernelName))
-        for (auto &Kv : SpilledArgAnnotations_[KernelName])
-          FnInfo->SpilledArgs_.insert(Kv);
+      if (SpilledArgAnnotations_.count(KernelName)) {
+        FnInfo->HasByRefArgs_ = true;
+        for (auto &Kv : SpilledArgAnnotations_[KernelName]) {
+          FnInfo->ArgTypeInfo_[Kv.first].Kind = SPVTypeKind::PODByRef;
+          FnInfo->ArgTypeInfo_[Kv.first].Size = Kv.second;
+        }
+      }
 
       ModuleInfo.FuncInfoMap.emplace(std::make_pair(i.second, FnInfo));
     }
diff --git a/tests/runtime/TestArgVisitors.cpp b/tests/runtime/TestArgVisitors.cpp
index e4e0ba37d..3a135b572 100644
--- a/tests/runtime/TestArgVisitors.cpp
+++ b/tests/runtime/TestArgVisitors.cpp
@@ -34,6 +34,12 @@ int main() {
   ArgInfo.emplace_back(
       SPVArgTypeInfo{SPVTypeKind::POD, SPVStorageClass::CrossWorkgroup, 16});
 
+  // Arg 5: Simulate PODByRef - a POD argument too large to fit in
+  // driver's argument buffer so it is instead passed indirectly via a
+  // device buffer.
+  ArgInfo.emplace_back(SPVArgTypeInfo{
+      SPVTypeKind::PODByRef, SPVStorageClass::CrossWorkgroup, 1 << 20});
+
   // Simulate dynamic shared memory pointer. HipDynMem.cpp inserts it
   // at the end of the paremeter list
   ArgInfo.emplace_back(
@@ -42,13 +48,13 @@ int main() {
   SPVFuncInfo FI(ArgInfo);
 
   // Simulate client-side arguments.
-  int a, b, c, d, e;
-  std::vector<void *> ArgList{&a, &b, &c, &d, &e};
+  int a, b, c, d, e, f;
+  std::vector<void *> ArgList{&a, &b, &c, &d, &e, &f};
 
   // Test visitors.
 
-  assert(FI.getNumClientArgs() == 5);
-  assert(FI.getNumKernelArgs() == 8);
+  assert(FI.getNumClientArgs() == 6);
+  assert(FI.getNumKernelArgs() == 9);
 
   unsigned ArgIdx = 0;
   FI.visitClientArgs([&](const SPVFuncInfo::ClientArg &Arg) {
@@ -74,7 +80,9 @@ int main() {
     else if (Arg.Index == 4)
       assert(Arg.Kind == SPVTypeKind::POD);
     // Skip workgroup pointer for dynamic shared pointer.
-    else {
+    else if (Arg.Index == 5) {
+      assert(Arg.Kind == SPVTypeKind::POD);
+    } else {
       assert(false && "Broken test.");
       exit(1);
     }
@@ -113,6 +121,9 @@ int main() {
       assert(Arg.Kind == SPVTypeKind::POD);
       assert(Arg.Data == ArgList.at(4));
     } else if (Arg.Index == 7) {
+      assert(Arg.Kind == SPVTypeKind::PODByRef);
+      assert(Arg.Data == ArgList.at(5));
+    } else if (Arg.Index == 8) {
       assert(Arg.Kind == SPVTypeKind::Pointer);
       assert(Arg.isWorkgroupPtr());
       assert(Arg.Data == nullptr);

From 02f7658bebd9c11182e50e643ad558639552f387 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Fri, 15 Mar 2024 14:00:35 +0200
Subject: [PATCH 5/6] OpenCL: remove an unused member variable.

---
 src/backend/OpenCL/CHIPBackendOpenCL.hh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/backend/OpenCL/CHIPBackendOpenCL.hh b/src/backend/OpenCL/CHIPBackendOpenCL.hh
index 8369eed8e..2be7849ed 100644
--- a/src/backend/OpenCL/CHIPBackendOpenCL.hh
+++ b/src/backend/OpenCL/CHIPBackendOpenCL.hh
@@ -433,7 +433,6 @@ public:
       : ExecItem(GirdDim, BlockDim, SharedMem, ChipQueue) {}
 
   virtual ~CHIPExecItemOpenCL() override {}
-  SPVFuncInfo FuncInfo;
   virtual void setupAllArgs() override;
   cl_kernel getKernelHandle();
 

From 6541c2acbf525a91cbe138d9bf7954d2072f72fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= <henry.mikael.linjamaki@intel.com>
Date: Tue, 12 Mar 2024 13:47:11 +0200
Subject: [PATCH 6/6] Eliminate redundant kernel argument copies

... within hipLaunchKernel() call.

Along the way fix CHIPGraphNodeKernel instances didn't copy kernel arguments
fully (they only copied pointers to arguments but not their values).
---
 src/CHIPBackend.cc                |  9 ++------
 src/CHIPBackend.hh                |  6 ++---
 src/CHIPGraph.cc                  | 18 ++++++++++-----
 src/CHIPGraph.hh                  |  6 +++++
 src/SPIRVFuncInfo.cc              | 22 ++++++++----------
 src/SPIRVFuncInfo.hh              | 12 ++++------
 src/Utils.cc                      | 38 +++++++++++++++++++++++++++++++
 src/Utils.hh                      |  3 +++
 tests/runtime/TestArgVisitors.cpp | 21 +++++++++--------
 9 files changed, 88 insertions(+), 47 deletions(-)

diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc
index e786bce90..7bfd5099f 100644
--- a/src/CHIPBackend.cc
+++ b/src/CHIPBackend.cc
@@ -34,7 +34,7 @@ static void queueKernel(chipstar::Queue *Q, chipstar::Kernel *K,
       ::Backend->createExecItem(GridDim, BlockDim, SharedMemSize, Q);
   EI->setKernel(K);
 
-  EI->copyArgs(Args);
+  EI->setArgs(Args);
   EI->setupAllArgs();
 
   auto ChipQueue = EI->getQueue();
@@ -497,11 +497,6 @@ void *chipstar::ArgSpillBuffer ::allocate(const SPVFuncInfo::Arg &Arg) {
 
 // ExecItem
 //*************************************************************************************
-void chipstar::ExecItem::copyArgs(void **Args) {
-  for (int i = 0; i < getNumArgs(); i++) {
-    Args_.push_back(Args[i]);
-  }
-}
 
 chipstar::ExecItem::ExecItem(dim3 GridDim, dim3 BlockDim, size_t SharedMem,
                              hipStream_t ChipQueue)
@@ -1890,7 +1885,7 @@ void chipstar::Queue::launchKernel(chipstar::Kernel *ChipKernel, dim3 NumBlocks,
   chipstar::ExecItem *ExItem =
       ::Backend->createExecItem(NumBlocks, DimBlocks, SharedMemBytes, this);
   ExItem->setKernel(ChipKernel);
-  ExItem->copyArgs(Args);
+  ExItem->setArgs(Args);
   ExItem->setupAllArgs();
   launch(ExItem);
   delete ExItem;
diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh
index d0ecd4564..a87568018 100644
--- a/src/CHIPBackend.hh
+++ b/src/CHIPBackend.hh
@@ -1177,12 +1177,12 @@ protected:
 
   chipstar::Queue *ChipQueue_;
 
-  std::vector<void *> Args_;
+  void **Args_;
 
   std::shared_ptr<chipstar::ArgSpillBuffer> ArgSpillBuffer_;
 
 public:
-  void copyArgs(void **Args);
+  void setArgs(void **Args) { Args_ = Args; }
   void setQueue(chipstar::Queue *Queue) { ChipQueue_ = Queue; }
   std::mutex ExecItemMtx;
   size_t getNumArgs() {
@@ -1194,7 +1194,7 @@ public:
   /**
    * @brief Return argument list.
    */
-  const std::vector<void *> &getArgs() const { return Args_; }
+  void **getArgs() const { return Args_; }
 
   /**
    * @brief Deleted default constructor
diff --git a/src/CHIPGraph.cc b/src/CHIPGraph.cc
index 34d158367..e6ea9c299 100644
--- a/src/CHIPGraph.cc
+++ b/src/CHIPGraph.cc
@@ -125,18 +125,22 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const hipKernelNodeParams *TheParams)
   Params_.extra = TheParams->extra;
   Params_.func = TheParams->func;
   Params_.gridDim = TheParams->gridDim;
-  Params_.kernelParams = TheParams->kernelParams;
   Params_.sharedMemBytes = TheParams->sharedMemBytes;
+
   auto Dev = Backend->getActiveDevice();
   chipstar::Kernel *ChipKernel = Dev->findKernel(HostPtr(Params_.func));
   if (!ChipKernel)
     CHIPERR_LOG_AND_THROW("Could not find requested kernel",
                           hipErrorInvalidDeviceFunction);
+
+  copyKernelArgs(ArgList_, ArgData_, TheParams->kernelParams,
+                 *ChipKernel->getFuncInfo());
+  Params_.kernelParams = ArgList_.data();
+
   ExecItem_ = Backend->createExecItem(Params_.gridDim, Params_.blockDim,
                                       Params_.sharedMemBytes, nullptr);
   ExecItem_->setKernel(ChipKernel);
-
-  ExecItem_->copyArgs(TheParams->kernelParams);
+  ExecItem_->setArgs(TheParams->kernelParams);
   ExecItem_->setupAllArgs();
 }
 
@@ -149,7 +153,6 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const void *HostFunction, dim3 GridDim,
   Params_.extra = nullptr;
   Params_.func = const_cast<void *>(HostFunction);
   Params_.gridDim = GridDim;
-  Params_.kernelParams = Args;
   Params_.sharedMemBytes = SharedMem;
 
   auto Dev = Backend->getActiveDevice();
@@ -157,10 +160,13 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const void *HostFunction, dim3 GridDim,
   if (!ChipKernel)
     CHIPERR_LOG_AND_THROW("Could not find requested kernel",
                           hipErrorInvalidDeviceFunction);
+
+  copyKernelArgs(ArgList_, ArgData_, Args, *ChipKernel->getFuncInfo());
+  Params_.kernelParams = ArgList_.data();
+
   ExecItem_ = Backend->createExecItem(GridDim, BlockDim, SharedMem, nullptr);
   ExecItem_->setKernel(ChipKernel);
-
-  ExecItem_->copyArgs(Args);
+  ExecItem_->setArgs(Params_.kernelParams);
   ExecItem_->setupAllArgs();
 }
 
diff --git a/src/CHIPGraph.hh b/src/CHIPGraph.hh
index 191d6b304..2c4370a4b 100644
--- a/src/CHIPGraph.hh
+++ b/src/CHIPGraph.hh
@@ -261,6 +261,12 @@ public:
 
 class CHIPGraphNodeKernel : public CHIPGraphNode {
 private:
+  /// A block holding the bytes of the kernel arguments.
+  std::vector<char> ArgData_;
+
+  /// pointer to start of the kernel argument data for each kernel argument.
+  std::vector<void *> ArgList_;
+
   hipKernelNodeParams Params_;
   chipstar::ExecItem *ExecItem_;
 
diff --git a/src/SPIRVFuncInfo.cc b/src/SPIRVFuncInfo.cc
index 445e2935a..50c89f054 100644
--- a/src/SPIRVFuncInfo.cc
+++ b/src/SPIRVFuncInfo.cc
@@ -85,7 +85,7 @@ std::string_view SPVFuncInfo::Arg::getKindAsString() const {
 }
 
 /// Client side kernel argument visitor.
-void SPVFuncInfo::visitClientArgsImpl(const std::vector<void *> &ClientArgList,
+void SPVFuncInfo::visitClientArgsImpl(void **ClientArgList,
                                       ClientArgVisitor Visitor) const {
 
   unsigned ArgListIndex = 0;
@@ -113,12 +113,10 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector<void *> &ClientArgList,
       // Image argument replaced hipTextureObject_t argument.
       ArgKind = SPVTypeKind::Pointer;
 
-    auto *ArgData =
-        ClientArgList.empty() ? nullptr : ClientArgList[ArgListIndex];
+    auto *ArgData = !ClientArgList ? nullptr : ClientArgList[ArgListIndex];
 
     // Clang generated argument list should not have nullptrs in it.
-    assert((ClientArgList.empty() || ArgData) &&
-           "nullptr in the argument list");
+    assert((!ClientArgList || ArgData) && "nullptr in the argument list");
 
     ClientArg CArg{
         {{ArgKind, ArgTI.StorageClass, ArgSize}, ArgListIndex, ArgData}};
@@ -128,19 +126,18 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector<void *> &ClientArgList,
 }
 
 /// Visit client-visible kernel arguments
-void SPVFuncInfo::visitClientArgs(const std::vector<void *> &ClientArgList,
+void SPVFuncInfo::visitClientArgs(void **ClientArgList,
                                   ClientArgVisitor Visitor) const {
-  assert(ClientArgList.size() == getNumClientArgs());
   visitClientArgsImpl(ClientArgList, Visitor);
 }
 
 /// Visit client-visible kernel arguments without the argument value
 /// (Arg::Data will be nullptr).
 void SPVFuncInfo::visitClientArgs(ClientArgVisitor Visitor) const {
-  visitClientArgsImpl(std::vector<void *>(), Visitor);
+  visitClientArgsImpl(nullptr, Visitor);
 }
 
-void SPVFuncInfo::visitKernelArgsImpl(const std::vector<void *> &ClientArgList,
+void SPVFuncInfo::visitKernelArgsImpl(void **ClientArgList,
                                       KernelArgVisitor Visitor) const {
   unsigned ArgIndex = 0;
   unsigned ArgListIndex = 0;
@@ -156,7 +153,7 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector<void *> &ClientArgList,
       ArgListIndex--;
 
     const void *ArgData = nullptr;
-    if (!ClientArgList.empty() && !ArgTI.isWorkgroupPtr()) {
+    if (ClientArgList && !ArgTI.isWorkgroupPtr()) {
       ArgData = ClientArgList[ArgListIndex];
 
       // Clang geerated  argument list should not have nullptrs in it.
@@ -172,15 +169,14 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector<void *> &ClientArgList,
 }
 
 // Visit kernel arguments
-void SPVFuncInfo::visitKernelArgs(const std::vector<void *> &ClientArgList,
+void SPVFuncInfo::visitKernelArgs(void **ClientArgList,
                                   KernelArgVisitor Visitor) const {
-  assert(ClientArgList.size() == getNumClientArgs());
   visitKernelArgsImpl(ClientArgList, Visitor);
 }
 
 /// Visit kernel arguments without argument list (Arg::Data will be nullptr)
 void SPVFuncInfo::visitKernelArgs(KernelArgVisitor Visitor) const {
-  visitKernelArgsImpl(std::vector<void *>(), Visitor);
+  visitKernelArgsImpl(nullptr, Visitor);
 }
 
 /// Return HIP user visible kernel argument count.
diff --git a/src/SPIRVFuncInfo.hh b/src/SPIRVFuncInfo.hh
index c62ec2a8f..c6883dd7d 100644
--- a/src/SPIRVFuncInfo.hh
+++ b/src/SPIRVFuncInfo.hh
@@ -101,11 +101,9 @@ public:
   SPVFuncInfo(const std::vector<SPVArgTypeInfo> &Info) : ArgTypeInfo_(Info) {}
 
   void visitClientArgs(ClientArgVisitor Fn) const;
-  void visitClientArgs(const std::vector<void *> &ArgList,
-                       ClientArgVisitor Fn) const;
+  void visitClientArgs(void **ArgList, ClientArgVisitor Fn) const;
   void visitKernelArgs(KernelArgVisitor Fn) const;
-  void visitKernelArgs(const std::vector<void *> &ArgList,
-                       KernelArgVisitor Fn) const;
+  void visitKernelArgs(void **ArgList, KernelArgVisitor Fn) const;
 
   /// Return visible kernel argument count.
   ///
@@ -120,10 +118,8 @@ public:
   bool hasByRefArgs() const noexcept { return HasByRefArgs_; }
 
 private:
-  void visitClientArgsImpl(const std::vector<void *> &ArgList,
-                           ClientArgVisitor Fn) const;
-  void visitKernelArgsImpl(const std::vector<void *> &ArgList,
-                           KernelArgVisitor Fn) const;
+  void visitClientArgsImpl(void **ArgList, ClientArgVisitor Fn) const;
+  void visitKernelArgsImpl(void **ArgList, KernelArgVisitor Fn) const;
 };
 
 typedef std::map<int32_t, std::shared_ptr<SPVFuncInfo>> SPVFuncInfoMap;
diff --git a/src/Utils.cc b/src/Utils.cc
index 6d771f24b..43f7a7f30 100644
--- a/src/Utils.cc
+++ b/src/Utils.cc
@@ -259,3 +259,41 @@ bool startsWith(std::string_view Str, std::string_view WithStr) {
   return Str.size() >= WithStr.size() &&
          Str.substr(0, WithStr.size()) == WithStr;
 }
+
+/// Deep copies kernel arguments pointed by 'CopyArg'. Bytes of the
+/// argument values are stored in 'ArgData'. 'ArgList[I]' points to
+/// the argument value in 'ArgData' for Ith kernel argument.
+void copyKernelArgs(std::vector<void *> &ArgList, std::vector<char> &ArgData,
+                    void **CopyFrom, const SPVFuncInfo &FuncInfo) {
+
+  ArgList.clear();
+  ArgData.clear();
+
+  std::vector<size_t> Offsets;
+  size_t CurrOffset = 0;
+
+  auto CopyArgData = [&](const SPVFuncInfo::ClientArg &Arg) {
+    assert((Arg.Kind == SPVTypeKind::POD || Arg.Kind == SPVTypeKind::Pointer) &&
+           "Unexpected argument kind.");
+
+    size_t Size = Arg.Size;
+    size_t Alignment = roundUpToPowerOfTwo(Size);
+    assert(Size && Alignment && "Invalid arg size or alignment!");
+
+    CurrOffset = roundUp(CurrOffset, Alignment);
+    logDebug("arg {} tgt offset: {}", Arg.Index, CurrOffset);
+    Offsets.push_back(CurrOffset);
+    assert(CurrOffset >= ArgData.size());
+
+    ArgData.resize(CurrOffset + Size, 0);
+    std::memcpy(ArgData.data() + CurrOffset, Arg.Data, Size);
+
+    CurrOffset += Size;
+  };
+  FuncInfo.visitClientArgs(CopyFrom, CopyArgData);
+
+  ArgList.reserve(Offsets.size());
+  char *BasePtr = ArgData.data();
+  for (auto Offset : Offsets)
+    ArgList.push_back(static_cast<void *>(BasePtr + Offset));
+}
diff --git a/src/Utils.hh b/src/Utils.hh
index 0c32676de..a090f1829 100644
--- a/src/Utils.hh
+++ b/src/Utils.hh
@@ -144,4 +144,7 @@ template <class T> struct PointerCmp {
   }
 };
 
+void copyKernelArgs(std::vector<void *> &ArgList, std::vector<char> &ArgData,
+                    void **CopyFrom, const SPVFuncInfo &FuncInfo);
+
 #endif
diff --git a/tests/runtime/TestArgVisitors.cpp b/tests/runtime/TestArgVisitors.cpp
index 3a135b572..eb5a16a65 100644
--- a/tests/runtime/TestArgVisitors.cpp
+++ b/tests/runtime/TestArgVisitors.cpp
@@ -49,7 +49,8 @@ int main() {
 
   // Simulate client-side arguments.
   int a, b, c, d, e, f;
-  std::vector<void *> ArgList{&a, &b, &c, &d, &e, &f};
+  std::vector<void *> ArgListVec{&a, &b, &c, &d, &e, &f};
+  void **ArgList = static_cast<void **>(ArgListVec.data());
 
   // Test visitors.
 
@@ -66,7 +67,7 @@ int main() {
   ArgIdx = 0;
   FI.visitClientArgs(ArgList, [&](const SPVFuncInfo::ClientArg &Arg) {
     assert(Arg.Index == ArgIdx++);
-    assert(Arg.Data == ArgList.at(Arg.Index));
+    assert(Arg.Data == ArgListVec.at(Arg.Index));
     if (Arg.Index == 0)
       assert(Arg.Kind == SPVTypeKind::Pointer);
     else if (Arg.Index == 1)
@@ -101,28 +102,28 @@ int main() {
 
     if (Arg.Index == 0) {
       assert(Arg.Kind == SPVTypeKind::Pointer);
-      assert(Arg.Data == ArgList.at(0));
+      assert(Arg.Data == ArgListVec.at(0));
     } else if (Arg.Index == 1) {
       assert(Arg.Kind == SPVTypeKind::Image);
-      assert(Arg.Data == ArgList.at(1));
+      assert(Arg.Data == ArgListVec.at(1));
     } else if (Arg.Index == 2) {
       assert(Arg.Kind == SPVTypeKind::Sampler);
-      assert(Arg.Data == ArgList.at(1));
+      assert(Arg.Data == ArgListVec.at(1));
     } else if (Arg.Index == 3) {
       assert(Arg.Kind == SPVTypeKind::POD);
-      assert(Arg.Data == ArgList.at(2));
+      assert(Arg.Data == ArgListVec.at(2));
     } else if (Arg.Index == 4) {
       assert(Arg.Kind == SPVTypeKind::Image);
-      assert(Arg.Data == ArgList.at(3));
+      assert(Arg.Data == ArgListVec.at(3));
     } else if (Arg.Index == 5) {
       assert(Arg.Kind == SPVTypeKind::Sampler);
-      assert(Arg.Data == ArgList.at(3));
+      assert(Arg.Data == ArgListVec.at(3));
     } else if (Arg.Index == 6) {
       assert(Arg.Kind == SPVTypeKind::POD);
-      assert(Arg.Data == ArgList.at(4));
+      assert(Arg.Data == ArgListVec.at(4));
     } else if (Arg.Index == 7) {
       assert(Arg.Kind == SPVTypeKind::PODByRef);
-      assert(Arg.Data == ArgList.at(5));
+      assert(Arg.Data == ArgListVec.at(5));
     } else if (Arg.Index == 8) {
       assert(Arg.Kind == SPVTypeKind::Pointer);
       assert(Arg.isWorkgroupPtr());