From 173420d3c3d4ecc82e0fa4058297230824e15af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Tue, 12 Mar 2024 12:09:32 +0200 Subject: [PATCH 1/6] Enable LTO / IPO on the runtime library Enable link-time / interprocedural optimizations on the runtime library if available on the system. --- CMakeLists.txt | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 289557420..3b98bb31b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -320,6 +320,18 @@ else() set(CHIP_LIB_NAME "libCHIP.a") endif() +include(CheckIPOSupported) +check_ipo_supported(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR) +message(STATUS "Interprocedural optimizations (IPO): ${SUPPORTS_IPO}.") +if(SUPPORTS_IPO) + set_property(TARGET CHIP PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE) +else() + message(NOTICE "To enable IPO with clang you may need to re-configure \ +LLVM/Clang with -DLLVM_BINUTILS_INCDIR for locating plugin-api.h from \ +binutils-dev package.") + message(STATUS "IPO error: ${IPO_ERROR}") +endif() + set(CHIP_INTERFACE_LIBS ${PTHREAD_LIBRARY}) if(OpenCL_LIBRARY) From 29599401bcc7dcc6baf5291ce67f4b99113c7aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Tue, 12 Mar 2024 14:21:43 +0200 Subject: [PATCH 2/6] Add early exit from registeredVarCopy() Exit early if there are no hipMemoryTypeManaged and hipMemoryTypeHost allocations in the current context. --- src/CHIPBackend.cc | 9 +++++++-- src/CHIPBackend.hh | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index 591f6c5ca..5be039de3 100644 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -167,6 +167,7 @@ void chipstar::AllocationTracker::recordAllocation( // Map onto host so that the data can be potentially initialized on host ::Backend->getActiveDevice()->getDefaultQueue()->MemMap( AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_WRITE); + NumHostAllocations_ += 1; } if (MemoryType == hipMemoryTypeUnified) @@ -1758,9 +1759,13 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem, // the kernel does not have any, we only need inspect kernels // pointer arguments for allocations to be synchronized. - std::vector> CopyEvents; + auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker; + if (!AllocTracker->getNumHostAllocations() && + !AllocTracker->getNumManagedAllocations()) + return nullptr; // Nothing to synchronize. + auto PreKernel = ExecState == MANAGED_MEM_STATE::PRE_KERNEL; - auto &AllocTracker = ::Backend->getActiveDevice()->AllocTracker; + std::vector> CopyEvents; auto ArgVisitor = [&](const chipstar::AllocationInfo &AllocInfo) -> void { if (AllocInfo.MemoryType == hipMemoryTypeHost) { logDebug("Sync host memory {} ({})", AllocInfo.HostPtr, diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh index ce92533a5..8ec383700 100644 --- a/src/CHIPBackend.hh +++ b/src/CHIPBackend.hh @@ -500,6 +500,9 @@ private: std::unordered_set AllocInfos_; std::unordered_map PtrToAllocInfo_; + size_t NumHostAllocations_ = 0; + size_t NumManagedAllocations_ = 0; + public: mutable std::mutex AllocationTrackerMtx; @@ -522,6 +525,7 @@ public: this->PtrToAllocInfo_[HostPtr] = AllocInfo; AllocInfo->MemoryType = hipMemoryTypeManaged; AllocInfo->IsHostRegistered = true; + NumManagedAllocations_ += 1; } size_t GlobalMemSize, TotalMemSize, MaxMemUsed; @@ -604,6 +608,20 @@ public: assert(AllocInfo && "Null pointer passed to eraseRecord"); assert(AllocInfos_.count(AllocInfo) && "Not a member of the allocation tracker!"); + + switch (AllocInfo->MemoryType) { + default: + break; + case hipMemoryTypeHost: + assert(NumHostAllocations_ > 0); + NumHostAllocations_ -= 1; + break; + case hipMemoryTypeManaged: + assert(NumManagedAllocations_ > 0); + NumManagedAllocations_ -= 1; + break; + } + PtrToAllocInfo_.erase(AllocInfo->DevPtr); if (AllocInfo->HostPtr) PtrToAllocInfo_.erase(AllocInfo->HostPtr); @@ -623,6 +641,10 @@ public: } size_t getNumAllocations() const { return AllocInfos_.size(); } + + // Return the number of host type allocations. + size_t getNumHostAllocations() const { return NumHostAllocations_; } + size_t getNumManagedAllocations() const { return NumManagedAllocations_; } }; class DeviceVar { From 9f578f63dcf2493bcd807fffae300bbab93bad64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 14 Mar 2024 15:23:09 +0200 Subject: [PATCH 3/6] Early exit from getSyncQueuesLastEvents() --- src/CHIPBackend.cc | 38 ++++++++++++++++--------- src/CHIPBackend.hh | 14 ++++++--- src/backend/Level0/CHIPBackendLevel0.cc | 3 +- src/backend/OpenCL/CHIPBackendOpenCL.cc | 16 +++++------ 4 files changed, 45 insertions(+), 26 deletions(-) diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index 5be039de3..e786bce90 100644 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -532,10 +532,9 @@ chipstar::Device::~Device() { if (PerThreadDefaultQueue) PerThreadDefaultQueue->finish(); - while (this->ChipQueues_.size() > 0) { - delete ChipQueues_[0]; - ChipQueues_.erase(ChipQueues_.begin()); - } + for (auto *Queue : UserQueues_) + delete Queue; + UserQueues_.clear(); delete LegacyDefaultQueue; LegacyDefaultQueue = nullptr; @@ -849,9 +848,9 @@ void chipstar::Device::addQueue(chipstar::Queue *ChipQueue) { logDebug("{} Device::addQueue({})", (void *)this, (void *)ChipQueue); auto QueueFound = - std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue); - if (QueueFound == ChipQueues_.end()) { - ChipQueues_.push_back(ChipQueue); + std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue); + if (QueueFound == UserQueues_.end()) { + UserQueues_.push_back(ChipQueue); } else { CHIPERR_LOG_AND_THROW("Tried to add a queue to the backend which was " "already present in the backend queue list", @@ -919,19 +918,19 @@ bool chipstar::Device::removeQueue(chipstar::Queue *ChipQueue) { * * Choosing not to call Queue->finish() */ - LOCK(DeviceMtx) // reading chipstar::Device::ChipQueues_ + LOCK(DeviceMtx) // reading chipstar::Device::UserQueues_ ChipQueue->updateLastEvent(nullptr); // Remove from device queue list auto FoundQueue = - std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue); - if (FoundQueue == ChipQueues_.end()) { + std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue); + if (FoundQueue == UserQueues_.end()) { std::string Msg = "Tried to remove a queue for a device but the queue was not found in " "device queue list"; CHIPERR_LOG_AND_THROW(Msg, hipErrorUnknown); } - ChipQueues_.erase(FoundQueue); + UserQueues_.erase(FoundQueue); delete ChipQueue; return true; @@ -1492,13 +1491,23 @@ void chipstar::Queue::updateLastEvent( LastEvent_ = NewEvent; } +/// Return a list of events from other queues that the current queue needs to +/// synchronize with for modeling the implicit synchronization behavior of the +/// NULL stream. Called queue's last event is included if 'IncludeSelfLastEvent' +/// is true. std::pair -chipstar::Queue::getSyncQueuesLastEvents( - std::shared_ptr Event) { +chipstar::Queue::getSyncQueuesLastEvents(std::shared_ptr Event, + bool IncludeSelfLastEvent) { std::vector> EventsToWaitOn; std::vector>> EventLocks; + // No need for default-stream implicit synchronization if there are + // no user created blocking queues. + auto NumUserQueues = ChipDevice_->getNumUserQueues(); + if (!NumUserQueues && !IncludeSelfLastEvent) + return {EventsToWaitOn, std::move(EventLocks)}; + EventLocks.push_back(std::make_unique>( ::Backend->GlobalLastEventMtx)); EventLocks.push_back( @@ -1517,6 +1526,9 @@ chipstar::Queue::getSyncQueuesLastEvents( EventLocks.push_back( std::make_unique>(Event->EventMtx)); + if (!NumUserQueues) + return {EventsToWaitOn, std::move(EventLocks)}; + // If this stream is default legacy stream, sync with all other streams on // this device if (this->isDefaultLegacyQueue() || this->isDefaultPerThreadQueue()) { diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh index 8ec383700..d0ecd4564 100644 --- a/src/CHIPBackend.hh +++ b/src/CHIPBackend.hh @@ -1310,7 +1310,10 @@ class Device { protected: std::string DeviceName_; chipstar::Context *Ctx_; - std::vector ChipQueues_; + + /// List of user created queues. + std::vector UserQueues_; + std::once_flag PropsPopulated_; hipDeviceAttribute_t Attrs_; @@ -1341,7 +1344,10 @@ public: std::mutex DeviceMtx; std::mutex QueueAddRemoveMtx; - std::vector getQueuesNoLock() { return ChipQueues_; } + std::vector getQueuesNoLock() { return UserQueues_; } + + /// Return the number of user created queues. + size_t getNumUserQueues() const noexcept { return UserQueues_.size(); } chipstar::Queue *LegacyDefaultQueue; inline static thread_local std::unique_ptr @@ -2131,9 +2137,9 @@ public: isPerThreadDefaultQueue_ = Status; } - std::pair getSyncQueuesLastEvents(); std::pair - getSyncQueuesLastEvents(std::shared_ptr LastEvent); + getSyncQueuesLastEvents(std::shared_ptr LastEvent, + bool IncludeSelfLastEvent); enum MEM_MAP_TYPE { HOST_READ, HOST_WRITE, HOST_READ_WRITE }; virtual void MemMap(const chipstar::AllocationInfo *AllocInfo, MEM_MAP_TYPE MapType) {} diff --git a/src/backend/Level0/CHIPBackendLevel0.cc b/src/backend/Level0/CHIPBackendLevel0.cc index 448fe4227..786c649bf 100644 --- a/src/backend/Level0/CHIPBackendLevel0.cc +++ b/src/backend/Level0/CHIPBackendLevel0.cc @@ -830,7 +830,8 @@ CHIPQueueLevel0::~CHIPQueueLevel0() { std::pair, chipstar::LockGuardVector> CHIPQueueLevel0::addDependenciesQueueSync( std::shared_ptr TargetEvent) { - auto [EventsToWaitOn, EventLocks] = getSyncQueuesLastEvents(TargetEvent); + auto [EventsToWaitOn, EventLocks] = + getSyncQueuesLastEvents(TargetEvent, true); for (auto &Event : EventsToWaitOn) Event->isDeletedSanityCheck(); diff --git a/src/backend/OpenCL/CHIPBackendOpenCL.cc b/src/backend/OpenCL/CHIPBackendOpenCL.cc index 7513c166b..8e0445cad 100644 --- a/src/backend/OpenCL/CHIPBackendOpenCL.cc +++ b/src/backend/OpenCL/CHIPBackendOpenCL.cc @@ -1084,7 +1084,7 @@ void CHIPQueueOpenCL::MemMap(const chipstar::AllocationInfo *AllocInfo, auto MemMapEventNative = std::static_pointer_cast(MemMapEvent)->getNativePtr(); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); auto QueueHandle = get()->get(); @@ -1123,7 +1123,7 @@ void CHIPQueueOpenCL::MemUnmap(const chipstar::AllocationInfo *AllocInfo) { return; } logDebug("CHIPQueueOpenCL::MemUnmap"); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MemMapEvent, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); auto Status = clEnqueueSVMUnmap( @@ -1217,7 +1217,7 @@ std::shared_ptr CHIPQueueOpenCL::enqueueMarkerImpl() { static_cast(Backend)->createEventShared( ChipContext_); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MarkerEvent); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(MarkerEvent, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); auto Status = clEnqueueMarkerWithWaitList( @@ -1263,7 +1263,7 @@ CHIPQueueOpenCL::launchImpl(chipstar::ExecItem *ExecItem) { auto AllocationsToKeepAlive = annotateIndirectPointers( *OclContext, Kernel->getModule()->getInfo(), KernelHandle); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(LaunchEvent); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(LaunchEvent, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); auto Status = clEnqueueNDRangeKernel( @@ -1405,7 +1405,7 @@ CHIPQueueOpenCL::memCopyAsyncImpl(void *Dst, const void *Src, size_t Size) { #ifdef CHIP_DUBIOUS_LOCKS LOCK(Backend->DubiousLockOpenCL) #endif - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); @@ -1434,7 +1434,7 @@ CHIPQueueOpenCL::memFillAsyncImpl(void *Dst, size_t Size, const void *Pattern, static_cast(Backend)->createEventShared( ChipContext_); logTrace("clSVMmemfill {} / {} B\n", Dst, Size); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); int Retval = ::clEnqueueSVMMemFill( @@ -1527,7 +1527,7 @@ std::shared_ptr CHIPQueueOpenCL::enqueueBarrierImpl( std::static_pointer_cast(WaitEvent)->getNativeRef()); } // auto Status = ClQueue_->enqueueBarrierWithWaitList(&Events, &Barrier); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); @@ -1541,7 +1541,7 @@ std::shared_ptr CHIPQueueOpenCL::enqueueBarrierImpl( CHIPERR_CHECK_LOG_AND_THROW(Status, CL_SUCCESS, hipErrorTbd); } else { // auto Status = ClQueue_->enqueueBarrierWithWaitList(nullptr, &Barrier); - auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event); + auto [EventsToWait, EventLocks] = getSyncQueuesLastEvents(Event, false); std::vector SyncQueuesEventHandles = getOpenCLHandles(EventsToWait); From fed0f7cbd9a6a27b5948809e7cf937d42b4a5f79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Thu, 14 Mar 2024 16:46:22 +0200 Subject: [PATCH 4/6] SPIRVFuncInfo: eliminate map lookup This saves some time. --- src/SPIRVFuncInfo.cc | 24 ++---------------------- src/SPIRVFuncInfo.hh | 14 +++++--------- src/spirv.cc | 10 +++++++--- tests/runtime/TestArgVisitors.cpp | 21 ++++++++++++++++----- 4 files changed, 30 insertions(+), 39 deletions(-) diff --git a/src/SPIRVFuncInfo.cc b/src/SPIRVFuncInfo.cc index 36e7394ed..445e2935a 100644 --- a/src/SPIRVFuncInfo.cc +++ b/src/SPIRVFuncInfo.cc @@ -84,16 +84,6 @@ std::string_view SPVFuncInfo::Arg::getKindAsString() const { } } -bool SPVFuncInfo::isSpilledArg(unsigned KernelArgIndex) const { - return SpilledArgs_.count(KernelArgIndex); -} - -unsigned SPVFuncInfo::getSpilledArgSize(unsigned KernelArgIndex) const { - assert(isSpilledArg(KernelArgIndex)); - assert(KernelArgIndex < (1u << 16u) && "Definitely out-of-bounds"); - return SpilledArgs_.at(static_cast(KernelArgIndex)); -} - /// Client side kernel argument visitor. void SPVFuncInfo::visitClientArgsImpl(const std::vector &ClientArgList, ClientArgVisitor Visitor) const { @@ -105,11 +95,9 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector &ClientArgList, auto ArgKind = ArgTI.Kind; auto ArgSize = ArgTI.Size; - if (isSpilledArg(KernelArgIdx)) { - assert(ArgKind != SPVTypeKind::Image && ArgKind != SPVTypeKind::Sampler && - "Impossible arg kind to spill!"); + if (ArgKind == SPVTypeKind::PODByRef) { + // Users' POV the kernel argument is passed by value. ArgKind = SPVTypeKind::POD; - ArgSize = getSpilledArgSize(KernelArgIdx); } // Additional argument created by the texture lowering pass. @@ -160,14 +148,6 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector &ClientArgList, auto ArgKind = ArgTI.Kind; auto ArgSize = ArgTI.Size; - if (isSpilledArg(ArgIndex)) { - assert(ArgKind != SPVTypeKind::Image && ArgKind != SPVTypeKind::Sampler && - "Impossible arg kind to spill!"); - ArgKind = SPVTypeKind::PODByRef; - ArgSize = getSpilledArgSize(ArgIndex); - logDebug("*** Arg {} spill size: {}", ArgIndex, ArgSize); // DEBUG - } - // Sampler is additional argument generated by HipTextureLowering // pass and it appears after SPVTypeKind::Image argument. Pass // the same argument that was passed for the image argument diff --git a/src/SPIRVFuncInfo.hh b/src/SPIRVFuncInfo.hh index cca44a736..c62ec2a8f 100644 --- a/src/SPIRVFuncInfo.hh +++ b/src/SPIRVFuncInfo.hh @@ -39,9 +39,8 @@ enum class SPVTypeKind : unsigned { Pointer, // The type is a pointer of any storage class. // Kinds that may only appear in SPVFuncInfo::KernelArg. - PODByRef, // Same as PODB except the value is passed in an - // intermediate device buffer and a pointer to its - // location given to the kernel. + PODByRef, // Same as POD except the value is passed indirectly via + // a device buffer. Image, // The type is a image. Sampler, // The type is a sample. @@ -75,9 +74,8 @@ class SPVFuncInfo { std::vector ArgTypeInfo_; - /// Spilled argument annotations represented as pairs of argument - /// index (key) and argument size (value). - std::map SpilledArgs_; + /// Set to true if any argument is SPVTypeKind::PODByRef. + bool HasByRefArgs_ = false; public: /// A structure for argument info passed by the visitor methods. @@ -119,15 +117,13 @@ public: unsigned getNumKernelArgs() const { return ArgTypeInfo_.size(); } /// Return true is any argument is passed via intermediate buffer. - bool hasByRefArgs() const { return SpilledArgs_.size(); } + bool hasByRefArgs() const noexcept { return HasByRefArgs_; } private: void visitClientArgsImpl(const std::vector &ArgList, ClientArgVisitor Fn) const; void visitKernelArgsImpl(const std::vector &ArgList, KernelArgVisitor Fn) const; - bool isSpilledArg(unsigned KernelArgIndex) const; - unsigned getSpilledArgSize(unsigned KernelArgIndex) const; }; typedef std::map> SPVFuncInfoMap; diff --git a/src/spirv.cc b/src/spirv.cc index f838ccb2f..2b90bad59 100644 --- a/src/spirv.cc +++ b/src/spirv.cc @@ -579,9 +579,13 @@ class SPIRVmodule { assert(Fi != KernelInfoMap_.end()); auto FnInfo = Fi->second; - if (SpilledArgAnnotations_.count(KernelName)) - for (auto &Kv : SpilledArgAnnotations_[KernelName]) - FnInfo->SpilledArgs_.insert(Kv); + if (SpilledArgAnnotations_.count(KernelName)) { + FnInfo->HasByRefArgs_ = true; + for (auto &Kv : SpilledArgAnnotations_[KernelName]) { + FnInfo->ArgTypeInfo_[Kv.first].Kind = SPVTypeKind::PODByRef; + FnInfo->ArgTypeInfo_[Kv.first].Size = Kv.second; + } + } ModuleInfo.FuncInfoMap.emplace(std::make_pair(i.second, FnInfo)); } diff --git a/tests/runtime/TestArgVisitors.cpp b/tests/runtime/TestArgVisitors.cpp index e4e0ba37d..3a135b572 100644 --- a/tests/runtime/TestArgVisitors.cpp +++ b/tests/runtime/TestArgVisitors.cpp @@ -34,6 +34,12 @@ int main() { ArgInfo.emplace_back( SPVArgTypeInfo{SPVTypeKind::POD, SPVStorageClass::CrossWorkgroup, 16}); + // Arg 5: Simulate PODByRef - a POD argument too large to fit in + // driver's argument buffer so it is instead passed indirectly via a + // device buffer. + ArgInfo.emplace_back(SPVArgTypeInfo{ + SPVTypeKind::PODByRef, SPVStorageClass::CrossWorkgroup, 1 << 20}); + // Simulate dynamic shared memory pointer. HipDynMem.cpp inserts it // at the end of the paremeter list ArgInfo.emplace_back( @@ -42,13 +48,13 @@ int main() { SPVFuncInfo FI(ArgInfo); // Simulate client-side arguments. - int a, b, c, d, e; - std::vector ArgList{&a, &b, &c, &d, &e}; + int a, b, c, d, e, f; + std::vector ArgList{&a, &b, &c, &d, &e, &f}; // Test visitors. - assert(FI.getNumClientArgs() == 5); - assert(FI.getNumKernelArgs() == 8); + assert(FI.getNumClientArgs() == 6); + assert(FI.getNumKernelArgs() == 9); unsigned ArgIdx = 0; FI.visitClientArgs([&](const SPVFuncInfo::ClientArg &Arg) { @@ -74,7 +80,9 @@ int main() { else if (Arg.Index == 4) assert(Arg.Kind == SPVTypeKind::POD); // Skip workgroup pointer for dynamic shared pointer. - else { + else if (Arg.Index == 5) { + assert(Arg.Kind == SPVTypeKind::POD); + } else { assert(false && "Broken test."); exit(1); } @@ -113,6 +121,9 @@ int main() { assert(Arg.Kind == SPVTypeKind::POD); assert(Arg.Data == ArgList.at(4)); } else if (Arg.Index == 7) { + assert(Arg.Kind == SPVTypeKind::PODByRef); + assert(Arg.Data == ArgList.at(5)); + } else if (Arg.Index == 8) { assert(Arg.Kind == SPVTypeKind::Pointer); assert(Arg.isWorkgroupPtr()); assert(Arg.Data == nullptr); From 02f7658bebd9c11182e50e643ad558639552f387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Fri, 15 Mar 2024 14:00:35 +0200 Subject: [PATCH 5/6] OpenCL: remove an unused member variable. --- src/backend/OpenCL/CHIPBackendOpenCL.hh | 1 - 1 file changed, 1 deletion(-) diff --git a/src/backend/OpenCL/CHIPBackendOpenCL.hh b/src/backend/OpenCL/CHIPBackendOpenCL.hh index 8369eed8e..2be7849ed 100644 --- a/src/backend/OpenCL/CHIPBackendOpenCL.hh +++ b/src/backend/OpenCL/CHIPBackendOpenCL.hh @@ -433,7 +433,6 @@ public: : ExecItem(GirdDim, BlockDim, SharedMem, ChipQueue) {} virtual ~CHIPExecItemOpenCL() override {} - SPVFuncInfo FuncInfo; virtual void setupAllArgs() override; cl_kernel getKernelHandle(); From 6541c2acbf525a91cbe138d9bf7954d2072f72fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henry=20Linjam=C3=A4ki?= Date: Tue, 12 Mar 2024 13:47:11 +0200 Subject: [PATCH 6/6] Eliminate redundant kernel argument copies ... within hipLaunchKernel() call. Along the way fix CHIPGraphNodeKernel instances didn't copy kernel arguments fully (they only copied pointers to arguments but not their values). --- src/CHIPBackend.cc | 9 ++------ src/CHIPBackend.hh | 6 ++--- src/CHIPGraph.cc | 18 ++++++++++----- src/CHIPGraph.hh | 6 +++++ src/SPIRVFuncInfo.cc | 22 ++++++++---------- src/SPIRVFuncInfo.hh | 12 ++++------ src/Utils.cc | 38 +++++++++++++++++++++++++++++++ src/Utils.hh | 3 +++ tests/runtime/TestArgVisitors.cpp | 21 +++++++++-------- 9 files changed, 88 insertions(+), 47 deletions(-) diff --git a/src/CHIPBackend.cc b/src/CHIPBackend.cc index e786bce90..7bfd5099f 100644 --- a/src/CHIPBackend.cc +++ b/src/CHIPBackend.cc @@ -34,7 +34,7 @@ static void queueKernel(chipstar::Queue *Q, chipstar::Kernel *K, ::Backend->createExecItem(GridDim, BlockDim, SharedMemSize, Q); EI->setKernel(K); - EI->copyArgs(Args); + EI->setArgs(Args); EI->setupAllArgs(); auto ChipQueue = EI->getQueue(); @@ -497,11 +497,6 @@ void *chipstar::ArgSpillBuffer ::allocate(const SPVFuncInfo::Arg &Arg) { // ExecItem //************************************************************************************* -void chipstar::ExecItem::copyArgs(void **Args) { - for (int i = 0; i < getNumArgs(); i++) { - Args_.push_back(Args[i]); - } -} chipstar::ExecItem::ExecItem(dim3 GridDim, dim3 BlockDim, size_t SharedMem, hipStream_t ChipQueue) @@ -1890,7 +1885,7 @@ void chipstar::Queue::launchKernel(chipstar::Kernel *ChipKernel, dim3 NumBlocks, chipstar::ExecItem *ExItem = ::Backend->createExecItem(NumBlocks, DimBlocks, SharedMemBytes, this); ExItem->setKernel(ChipKernel); - ExItem->copyArgs(Args); + ExItem->setArgs(Args); ExItem->setupAllArgs(); launch(ExItem); delete ExItem; diff --git a/src/CHIPBackend.hh b/src/CHIPBackend.hh index d0ecd4564..a87568018 100644 --- a/src/CHIPBackend.hh +++ b/src/CHIPBackend.hh @@ -1177,12 +1177,12 @@ protected: chipstar::Queue *ChipQueue_; - std::vector Args_; + void **Args_; std::shared_ptr ArgSpillBuffer_; public: - void copyArgs(void **Args); + void setArgs(void **Args) { Args_ = Args; } void setQueue(chipstar::Queue *Queue) { ChipQueue_ = Queue; } std::mutex ExecItemMtx; size_t getNumArgs() { @@ -1194,7 +1194,7 @@ public: /** * @brief Return argument list. */ - const std::vector &getArgs() const { return Args_; } + void **getArgs() const { return Args_; } /** * @brief Deleted default constructor diff --git a/src/CHIPGraph.cc b/src/CHIPGraph.cc index 34d158367..e6ea9c299 100644 --- a/src/CHIPGraph.cc +++ b/src/CHIPGraph.cc @@ -125,18 +125,22 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const hipKernelNodeParams *TheParams) Params_.extra = TheParams->extra; Params_.func = TheParams->func; Params_.gridDim = TheParams->gridDim; - Params_.kernelParams = TheParams->kernelParams; Params_.sharedMemBytes = TheParams->sharedMemBytes; + auto Dev = Backend->getActiveDevice(); chipstar::Kernel *ChipKernel = Dev->findKernel(HostPtr(Params_.func)); if (!ChipKernel) CHIPERR_LOG_AND_THROW("Could not find requested kernel", hipErrorInvalidDeviceFunction); + + copyKernelArgs(ArgList_, ArgData_, TheParams->kernelParams, + *ChipKernel->getFuncInfo()); + Params_.kernelParams = ArgList_.data(); + ExecItem_ = Backend->createExecItem(Params_.gridDim, Params_.blockDim, Params_.sharedMemBytes, nullptr); ExecItem_->setKernel(ChipKernel); - - ExecItem_->copyArgs(TheParams->kernelParams); + ExecItem_->setArgs(TheParams->kernelParams); ExecItem_->setupAllArgs(); } @@ -149,7 +153,6 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const void *HostFunction, dim3 GridDim, Params_.extra = nullptr; Params_.func = const_cast(HostFunction); Params_.gridDim = GridDim; - Params_.kernelParams = Args; Params_.sharedMemBytes = SharedMem; auto Dev = Backend->getActiveDevice(); @@ -157,10 +160,13 @@ CHIPGraphNodeKernel::CHIPGraphNodeKernel(const void *HostFunction, dim3 GridDim, if (!ChipKernel) CHIPERR_LOG_AND_THROW("Could not find requested kernel", hipErrorInvalidDeviceFunction); + + copyKernelArgs(ArgList_, ArgData_, Args, *ChipKernel->getFuncInfo()); + Params_.kernelParams = ArgList_.data(); + ExecItem_ = Backend->createExecItem(GridDim, BlockDim, SharedMem, nullptr); ExecItem_->setKernel(ChipKernel); - - ExecItem_->copyArgs(Args); + ExecItem_->setArgs(Params_.kernelParams); ExecItem_->setupAllArgs(); } diff --git a/src/CHIPGraph.hh b/src/CHIPGraph.hh index 191d6b304..2c4370a4b 100644 --- a/src/CHIPGraph.hh +++ b/src/CHIPGraph.hh @@ -261,6 +261,12 @@ public: class CHIPGraphNodeKernel : public CHIPGraphNode { private: + /// A block holding the bytes of the kernel arguments. + std::vector ArgData_; + + /// pointer to start of the kernel argument data for each kernel argument. + std::vector ArgList_; + hipKernelNodeParams Params_; chipstar::ExecItem *ExecItem_; diff --git a/src/SPIRVFuncInfo.cc b/src/SPIRVFuncInfo.cc index 445e2935a..50c89f054 100644 --- a/src/SPIRVFuncInfo.cc +++ b/src/SPIRVFuncInfo.cc @@ -85,7 +85,7 @@ std::string_view SPVFuncInfo::Arg::getKindAsString() const { } /// Client side kernel argument visitor. -void SPVFuncInfo::visitClientArgsImpl(const std::vector &ClientArgList, +void SPVFuncInfo::visitClientArgsImpl(void **ClientArgList, ClientArgVisitor Visitor) const { unsigned ArgListIndex = 0; @@ -113,12 +113,10 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector &ClientArgList, // Image argument replaced hipTextureObject_t argument. ArgKind = SPVTypeKind::Pointer; - auto *ArgData = - ClientArgList.empty() ? nullptr : ClientArgList[ArgListIndex]; + auto *ArgData = !ClientArgList ? nullptr : ClientArgList[ArgListIndex]; // Clang generated argument list should not have nullptrs in it. - assert((ClientArgList.empty() || ArgData) && - "nullptr in the argument list"); + assert((!ClientArgList || ArgData) && "nullptr in the argument list"); ClientArg CArg{ {{ArgKind, ArgTI.StorageClass, ArgSize}, ArgListIndex, ArgData}}; @@ -128,19 +126,18 @@ void SPVFuncInfo::visitClientArgsImpl(const std::vector &ClientArgList, } /// Visit client-visible kernel arguments -void SPVFuncInfo::visitClientArgs(const std::vector &ClientArgList, +void SPVFuncInfo::visitClientArgs(void **ClientArgList, ClientArgVisitor Visitor) const { - assert(ClientArgList.size() == getNumClientArgs()); visitClientArgsImpl(ClientArgList, Visitor); } /// Visit client-visible kernel arguments without the argument value /// (Arg::Data will be nullptr). void SPVFuncInfo::visitClientArgs(ClientArgVisitor Visitor) const { - visitClientArgsImpl(std::vector(), Visitor); + visitClientArgsImpl(nullptr, Visitor); } -void SPVFuncInfo::visitKernelArgsImpl(const std::vector &ClientArgList, +void SPVFuncInfo::visitKernelArgsImpl(void **ClientArgList, KernelArgVisitor Visitor) const { unsigned ArgIndex = 0; unsigned ArgListIndex = 0; @@ -156,7 +153,7 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector &ClientArgList, ArgListIndex--; const void *ArgData = nullptr; - if (!ClientArgList.empty() && !ArgTI.isWorkgroupPtr()) { + if (ClientArgList && !ArgTI.isWorkgroupPtr()) { ArgData = ClientArgList[ArgListIndex]; // Clang geerated argument list should not have nullptrs in it. @@ -172,15 +169,14 @@ void SPVFuncInfo::visitKernelArgsImpl(const std::vector &ClientArgList, } // Visit kernel arguments -void SPVFuncInfo::visitKernelArgs(const std::vector &ClientArgList, +void SPVFuncInfo::visitKernelArgs(void **ClientArgList, KernelArgVisitor Visitor) const { - assert(ClientArgList.size() == getNumClientArgs()); visitKernelArgsImpl(ClientArgList, Visitor); } /// Visit kernel arguments without argument list (Arg::Data will be nullptr) void SPVFuncInfo::visitKernelArgs(KernelArgVisitor Visitor) const { - visitKernelArgsImpl(std::vector(), Visitor); + visitKernelArgsImpl(nullptr, Visitor); } /// Return HIP user visible kernel argument count. diff --git a/src/SPIRVFuncInfo.hh b/src/SPIRVFuncInfo.hh index c62ec2a8f..c6883dd7d 100644 --- a/src/SPIRVFuncInfo.hh +++ b/src/SPIRVFuncInfo.hh @@ -101,11 +101,9 @@ public: SPVFuncInfo(const std::vector &Info) : ArgTypeInfo_(Info) {} void visitClientArgs(ClientArgVisitor Fn) const; - void visitClientArgs(const std::vector &ArgList, - ClientArgVisitor Fn) const; + void visitClientArgs(void **ArgList, ClientArgVisitor Fn) const; void visitKernelArgs(KernelArgVisitor Fn) const; - void visitKernelArgs(const std::vector &ArgList, - KernelArgVisitor Fn) const; + void visitKernelArgs(void **ArgList, KernelArgVisitor Fn) const; /// Return visible kernel argument count. /// @@ -120,10 +118,8 @@ public: bool hasByRefArgs() const noexcept { return HasByRefArgs_; } private: - void visitClientArgsImpl(const std::vector &ArgList, - ClientArgVisitor Fn) const; - void visitKernelArgsImpl(const std::vector &ArgList, - KernelArgVisitor Fn) const; + void visitClientArgsImpl(void **ArgList, ClientArgVisitor Fn) const; + void visitKernelArgsImpl(void **ArgList, KernelArgVisitor Fn) const; }; typedef std::map> SPVFuncInfoMap; diff --git a/src/Utils.cc b/src/Utils.cc index 6d771f24b..43f7a7f30 100644 --- a/src/Utils.cc +++ b/src/Utils.cc @@ -259,3 +259,41 @@ bool startsWith(std::string_view Str, std::string_view WithStr) { return Str.size() >= WithStr.size() && Str.substr(0, WithStr.size()) == WithStr; } + +/// Deep copies kernel arguments pointed by 'CopyArg'. Bytes of the +/// argument values are stored in 'ArgData'. 'ArgList[I]' points to +/// the argument value in 'ArgData' for Ith kernel argument. +void copyKernelArgs(std::vector &ArgList, std::vector &ArgData, + void **CopyFrom, const SPVFuncInfo &FuncInfo) { + + ArgList.clear(); + ArgData.clear(); + + std::vector Offsets; + size_t CurrOffset = 0; + + auto CopyArgData = [&](const SPVFuncInfo::ClientArg &Arg) { + assert((Arg.Kind == SPVTypeKind::POD || Arg.Kind == SPVTypeKind::Pointer) && + "Unexpected argument kind."); + + size_t Size = Arg.Size; + size_t Alignment = roundUpToPowerOfTwo(Size); + assert(Size && Alignment && "Invalid arg size or alignment!"); + + CurrOffset = roundUp(CurrOffset, Alignment); + logDebug("arg {} tgt offset: {}", Arg.Index, CurrOffset); + Offsets.push_back(CurrOffset); + assert(CurrOffset >= ArgData.size()); + + ArgData.resize(CurrOffset + Size, 0); + std::memcpy(ArgData.data() + CurrOffset, Arg.Data, Size); + + CurrOffset += Size; + }; + FuncInfo.visitClientArgs(CopyFrom, CopyArgData); + + ArgList.reserve(Offsets.size()); + char *BasePtr = ArgData.data(); + for (auto Offset : Offsets) + ArgList.push_back(static_cast(BasePtr + Offset)); +} diff --git a/src/Utils.hh b/src/Utils.hh index 0c32676de..a090f1829 100644 --- a/src/Utils.hh +++ b/src/Utils.hh @@ -144,4 +144,7 @@ template struct PointerCmp { } }; +void copyKernelArgs(std::vector &ArgList, std::vector &ArgData, + void **CopyFrom, const SPVFuncInfo &FuncInfo); + #endif diff --git a/tests/runtime/TestArgVisitors.cpp b/tests/runtime/TestArgVisitors.cpp index 3a135b572..eb5a16a65 100644 --- a/tests/runtime/TestArgVisitors.cpp +++ b/tests/runtime/TestArgVisitors.cpp @@ -49,7 +49,8 @@ int main() { // Simulate client-side arguments. int a, b, c, d, e, f; - std::vector ArgList{&a, &b, &c, &d, &e, &f}; + std::vector ArgListVec{&a, &b, &c, &d, &e, &f}; + void **ArgList = static_cast(ArgListVec.data()); // Test visitors. @@ -66,7 +67,7 @@ int main() { ArgIdx = 0; FI.visitClientArgs(ArgList, [&](const SPVFuncInfo::ClientArg &Arg) { assert(Arg.Index == ArgIdx++); - assert(Arg.Data == ArgList.at(Arg.Index)); + assert(Arg.Data == ArgListVec.at(Arg.Index)); if (Arg.Index == 0) assert(Arg.Kind == SPVTypeKind::Pointer); else if (Arg.Index == 1) @@ -101,28 +102,28 @@ int main() { if (Arg.Index == 0) { assert(Arg.Kind == SPVTypeKind::Pointer); - assert(Arg.Data == ArgList.at(0)); + assert(Arg.Data == ArgListVec.at(0)); } else if (Arg.Index == 1) { assert(Arg.Kind == SPVTypeKind::Image); - assert(Arg.Data == ArgList.at(1)); + assert(Arg.Data == ArgListVec.at(1)); } else if (Arg.Index == 2) { assert(Arg.Kind == SPVTypeKind::Sampler); - assert(Arg.Data == ArgList.at(1)); + assert(Arg.Data == ArgListVec.at(1)); } else if (Arg.Index == 3) { assert(Arg.Kind == SPVTypeKind::POD); - assert(Arg.Data == ArgList.at(2)); + assert(Arg.Data == ArgListVec.at(2)); } else if (Arg.Index == 4) { assert(Arg.Kind == SPVTypeKind::Image); - assert(Arg.Data == ArgList.at(3)); + assert(Arg.Data == ArgListVec.at(3)); } else if (Arg.Index == 5) { assert(Arg.Kind == SPVTypeKind::Sampler); - assert(Arg.Data == ArgList.at(3)); + assert(Arg.Data == ArgListVec.at(3)); } else if (Arg.Index == 6) { assert(Arg.Kind == SPVTypeKind::POD); - assert(Arg.Data == ArgList.at(4)); + assert(Arg.Data == ArgListVec.at(4)); } else if (Arg.Index == 7) { assert(Arg.Kind == SPVTypeKind::PODByRef); - assert(Arg.Data == ArgList.at(5)); + assert(Arg.Data == ArgListVec.at(5)); } else if (Arg.Index == 8) { assert(Arg.Kind == SPVTypeKind::Pointer); assert(Arg.isWorkgroupPtr());