Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various small optimizations #816

Merged
merged 6 commits into from
May 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,18 @@ else()
set(CHIP_LIB_NAME "libCHIP.a")
endif()

# Enable interprocedural optimization (IPO) for the CHIP target when the
# toolchain check succeeds; otherwise report the output of the failed check.
include(CheckIPOSupported)
# SUPPORTS_IPO receives the check result, IPO_ERROR the check's output.
check_ipo_supported(RESULT SUPPORTS_IPO OUTPUT IPO_ERROR)
message(STATUS "Interprocedural optimizations (IPO): ${SUPPORTS_IPO}.")
if(SUPPORTS_IPO)
set_property(TARGET CHIP PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
else()
# Print a hint for clang users, then the output captured by the check.
message(NOTICE "To enable IPO with clang you may need to re-configure \
LLVM/Clang with -DLLVM_BINUTILS_INCDIR for locating plugin-api.h from \
binutils-dev package.")
message(STATUS "IPO error: ${IPO_ERROR}")
endif()

set(CHIP_INTERFACE_LIBS ${PTHREAD_LIBRARY})

if(OpenCL_LIBRARY)
Expand Down
56 changes: 34 additions & 22 deletions src/CHIPBackend.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ static void queueKernel(chipstar::Queue *Q, chipstar::Kernel *K,
::Backend->createExecItem(GridDim, BlockDim, SharedMemSize, Q);
EI->setKernel(K);

EI->copyArgs(Args);
EI->setArgs(Args);
EI->setupAllArgs();

auto ChipQueue = EI->getQueue();
Expand Down Expand Up @@ -167,6 +167,7 @@ void chipstar::AllocationTracker::recordAllocation(
// Map onto host so that the data can be potentially initialized on host
::Backend->getActiveDevice()->getDefaultQueue()->MemMap(
AllocInfo, chipstar::Queue::MEM_MAP_TYPE::HOST_WRITE);
NumHostAllocations_ += 1;
}

if (MemoryType == hipMemoryTypeUnified)
Expand Down Expand Up @@ -496,11 +497,6 @@ void *chipstar::ArgSpillBuffer ::allocate(const SPVFuncInfo::Arg &Arg) {

// ExecItem
//*************************************************************************************
/// Append the kernel argument pointers found in \p Args to this item's
/// argument list.
///
/// Reads getNumArgs() entries from \p Args, so the array must hold at least
/// that many pointers. Only the pointer values are stored; the data they refer
/// to is not copied.
void chipstar::ExecItem::copyArgs(void **Args) {
  // getNumArgs() returns size_t: a range insert avoids the signed/unsigned
  // comparison of an 'int' loop index and grows the vector at most once.
  Args_.insert(Args_.end(), Args, Args + getNumArgs());
}

chipstar::ExecItem::ExecItem(dim3 GridDim, dim3 BlockDim, size_t SharedMem,
hipStream_t ChipQueue)
Expand Down Expand Up @@ -531,10 +527,9 @@ chipstar::Device::~Device() {
if (PerThreadDefaultQueue)
PerThreadDefaultQueue->finish();

while (this->ChipQueues_.size() > 0) {
delete ChipQueues_[0];
ChipQueues_.erase(ChipQueues_.begin());
}
for (auto *Queue : UserQueues_)
delete Queue;
UserQueues_.clear();

delete LegacyDefaultQueue;
LegacyDefaultQueue = nullptr;
Expand Down Expand Up @@ -848,9 +843,9 @@ void chipstar::Device::addQueue(chipstar::Queue *ChipQueue) {
logDebug("{} Device::addQueue({})", (void *)this, (void *)ChipQueue);

auto QueueFound =
std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue);
if (QueueFound == ChipQueues_.end()) {
ChipQueues_.push_back(ChipQueue);
std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue);
if (QueueFound == UserQueues_.end()) {
UserQueues_.push_back(ChipQueue);
} else {
CHIPERR_LOG_AND_THROW("Tried to add a queue to the backend which was "
"already present in the backend queue list",
Expand Down Expand Up @@ -918,19 +913,19 @@ bool chipstar::Device::removeQueue(chipstar::Queue *ChipQueue) {
*
* Choosing not to call Queue->finish()
*/
LOCK(DeviceMtx) // reading chipstar::Device::ChipQueues_
LOCK(DeviceMtx) // reading chipstar::Device::UserQueues_
ChipQueue->updateLastEvent(nullptr);

// Remove from device queue list
auto FoundQueue =
std::find(ChipQueues_.begin(), ChipQueues_.end(), ChipQueue);
if (FoundQueue == ChipQueues_.end()) {
std::find(UserQueues_.begin(), UserQueues_.end(), ChipQueue);
if (FoundQueue == UserQueues_.end()) {
std::string Msg =
"Tried to remove a queue for a device but the queue was not found in "
"device queue list";
CHIPERR_LOG_AND_THROW(Msg, hipErrorUnknown);
}
ChipQueues_.erase(FoundQueue);
UserQueues_.erase(FoundQueue);

delete ChipQueue;
return true;
Expand Down Expand Up @@ -1491,13 +1486,23 @@ void chipstar::Queue::updateLastEvent(
LastEvent_ = NewEvent;
}

/// Return a list of events from other queues that the current queue needs to
/// synchronize with in order to model the implicit synchronization behavior of
/// the NULL stream. The called queue's own last event is included if
/// 'IncludeSelfLastEvent' is true.
std::pair<chipstar::SharedEventVector, chipstar::LockGuardVector>
chipstar::Queue::getSyncQueuesLastEvents(
std::shared_ptr<chipstar::Event> Event) {
chipstar::Queue::getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> Event,
bool IncludeSelfLastEvent) {

std::vector<std::shared_ptr<chipstar::Event>> EventsToWaitOn;
std::vector<std::unique_ptr<std::unique_lock<std::mutex>>> EventLocks;

// No need for default-stream implicit synchronization if there are
// no user created blocking queues.
auto NumUserQueues = ChipDevice_->getNumUserQueues();
if (!NumUserQueues && !IncludeSelfLastEvent)
return {EventsToWaitOn, std::move(EventLocks)};

EventLocks.push_back(std::make_unique<std::unique_lock<std::mutex>>(
::Backend->GlobalLastEventMtx));
EventLocks.push_back(
Expand All @@ -1516,6 +1521,9 @@ chipstar::Queue::getSyncQueuesLastEvents(
EventLocks.push_back(
std::make_unique<std::unique_lock<std::mutex>>(Event->EventMtx));

if (!NumUserQueues)
return {EventsToWaitOn, std::move(EventLocks)};

// If this stream is default legacy stream, sync with all other streams on
// this device
if (this->isDefaultLegacyQueue() || this->isDefaultPerThreadQueue()) {
Expand Down Expand Up @@ -1758,9 +1766,13 @@ chipstar::Queue::RegisteredVarCopy(chipstar::ExecItem *ExecItem,
// the kernel does not have any, we only need inspect kernels
// pointer arguments for allocations to be synchronized.

std::vector<std::shared_ptr<chipstar::Event>> CopyEvents;
auto *AllocTracker = ::Backend->getActiveDevice()->AllocTracker;
if (!AllocTracker->getNumHostAllocations() &&
!AllocTracker->getNumManagedAllocations())
return nullptr; // Nothing to synchronize.

auto PreKernel = ExecState == MANAGED_MEM_STATE::PRE_KERNEL;
auto &AllocTracker = ::Backend->getActiveDevice()->AllocTracker;
std::vector<std::shared_ptr<chipstar::Event>> CopyEvents;
auto ArgVisitor = [&](const chipstar::AllocationInfo &AllocInfo) -> void {
if (AllocInfo.MemoryType == hipMemoryTypeHost) {
logDebug("Sync host memory {} ({})", AllocInfo.HostPtr,
Expand Down Expand Up @@ -1873,7 +1885,7 @@ void chipstar::Queue::launchKernel(chipstar::Kernel *ChipKernel, dim3 NumBlocks,
chipstar::ExecItem *ExItem =
::Backend->createExecItem(NumBlocks, DimBlocks, SharedMemBytes, this);
ExItem->setKernel(ChipKernel);
ExItem->copyArgs(Args);
ExItem->setArgs(Args);
ExItem->setupAllArgs();
launch(ExItem);
delete ExItem;
Expand Down
42 changes: 35 additions & 7 deletions src/CHIPBackend.hh
Original file line number Diff line number Diff line change
Expand Up @@ -500,6 +500,9 @@
std::unordered_set<chipstar::AllocationInfo *> AllocInfos_;
std::unordered_map<void *, chipstar::AllocationInfo *> PtrToAllocInfo_;

size_t NumHostAllocations_ = 0;
size_t NumManagedAllocations_ = 0;

public:
mutable std::mutex AllocationTrackerMtx;

Expand All @@ -522,6 +525,7 @@
this->PtrToAllocInfo_[HostPtr] = AllocInfo;
AllocInfo->MemoryType = hipMemoryTypeManaged;
AllocInfo->IsHostRegistered = true;
NumManagedAllocations_ += 1;
}

size_t GlobalMemSize, TotalMemSize, MaxMemUsed;
Expand Down Expand Up @@ -604,6 +608,20 @@
assert(AllocInfo && "Null pointer passed to eraseRecord");
assert(AllocInfos_.count(AllocInfo) &&
"Not a member of the allocation tracker!");

switch (AllocInfo->MemoryType) {
default:
break;
case hipMemoryTypeHost:
assert(NumHostAllocations_ > 0);
NumHostAllocations_ -= 1;
break;
case hipMemoryTypeManaged:
assert(NumManagedAllocations_ > 0);
NumManagedAllocations_ -= 1;
break;
}

PtrToAllocInfo_.erase(AllocInfo->DevPtr);
if (AllocInfo->HostPtr)
PtrToAllocInfo_.erase(AllocInfo->HostPtr);
Expand All @@ -623,6 +641,10 @@
}

size_t getNumAllocations() const { return AllocInfos_.size(); }

/// Return the current value of the host allocation counter
/// (NumHostAllocations_), i.e. the number of tracked host type allocations.
size_t getNumHostAllocations() const { return NumHostAllocations_; }
/// Return the current value of the managed allocation counter
/// (NumManagedAllocations_), i.e. the number of tracked managed type
/// allocations.
size_t getNumManagedAllocations() const { return NumManagedAllocations_; }
};

class DeviceVar {
Expand Down Expand Up @@ -1155,12 +1177,12 @@

chipstar::Queue *ChipQueue_;

std::vector<void *> Args_;
void **Args_;

std::shared_ptr<chipstar::ArgSpillBuffer> ArgSpillBuffer_;

public:
void copyArgs(void **Args);
/// Set the kernel argument list. Only the pointer \p Args is stored; the
/// array it points to is not copied.
/// NOTE(review): the array presumably has to stay valid for as long as this
/// item reads its arguments - confirm against callers.
void setArgs(void **Args) { Args_ = Args; }
void setQueue(chipstar::Queue *Queue) { ChipQueue_ = Queue; }
std::mutex ExecItemMtx;
size_t getNumArgs() {
Expand All @@ -1172,7 +1194,7 @@
/**
* @brief Return argument list.
*/
const std::vector<void *> &getArgs() const { return Args_; }
void **getArgs() const { return Args_; }

Check warning on line 1197 in src/CHIPBackend.hh

View workflow job for this annotation

GitHub Actions / cpp-linter

src/CHIPBackend.hh:1197:10 [modernize-use-trailing-return-type]

use a trailing return type for this function

/**
* @brief Deleted default constructor
Expand Down Expand Up @@ -1288,7 +1310,10 @@
protected:
std::string DeviceName_;
chipstar::Context *Ctx_;
std::vector<chipstar::Queue *> ChipQueues_;

/// List of user created queues.
std::vector<chipstar::Queue *> UserQueues_;

std::once_flag PropsPopulated_;

hipDeviceAttribute_t Attrs_;
Expand Down Expand Up @@ -1319,7 +1344,10 @@
std::mutex DeviceMtx;
std::mutex QueueAddRemoveMtx;

std::vector<chipstar::Queue *> getQueuesNoLock() { return ChipQueues_; }
/// Return the list of user created queues by value (a copy of UserQueues_).
/// No mutex is acquired in this accessor.
/// NOTE(review): callers presumably hold the appropriate lock - confirm.
std::vector<chipstar::Queue *> getQueuesNoLock() { return UserQueues_; }

/// Return the number of user created queues (size of UserQueues_).
/// Queue::getSyncQueuesLastEvents() reads this to skip default-stream
/// implicit synchronization when there are none.
size_t getNumUserQueues() const noexcept { return UserQueues_.size(); }

chipstar::Queue *LegacyDefaultQueue;
inline static thread_local std::unique_ptr<chipstar::Queue>
Expand Down Expand Up @@ -2109,9 +2137,9 @@
isPerThreadDefaultQueue_ = Status;
}

std::pair<SharedEventVector, LockGuardVector> getSyncQueuesLastEvents();
std::pair<SharedEventVector, LockGuardVector>
getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> LastEvent);
getSyncQueuesLastEvents(std::shared_ptr<chipstar::Event> LastEvent,
bool IncludeSelfLastEvent);
enum MEM_MAP_TYPE { HOST_READ, HOST_WRITE, HOST_READ_WRITE };
virtual void MemMap(const chipstar::AllocationInfo *AllocInfo,
MEM_MAP_TYPE MapType) {}
Expand Down
18 changes: 12 additions & 6 deletions src/CHIPGraph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,18 +125,22 @@
Params_.extra = TheParams->extra;
Params_.func = TheParams->func;
Params_.gridDim = TheParams->gridDim;
Params_.kernelParams = TheParams->kernelParams;
Params_.sharedMemBytes = TheParams->sharedMemBytes;

auto Dev = Backend->getActiveDevice();

Check warning on line 130 in src/CHIPGraph.cc

View workflow job for this annotation

GitHub Actions / cpp-linter

src/CHIPGraph.cc:130:3 [readability-qualified-auto]

'auto Dev' can be declared as 'auto *Dev'
chipstar::Kernel *ChipKernel = Dev->findKernel(HostPtr(Params_.func));
if (!ChipKernel)
CHIPERR_LOG_AND_THROW("Could not find requested kernel",
hipErrorInvalidDeviceFunction);

copyKernelArgs(ArgList_, ArgData_, TheParams->kernelParams,
*ChipKernel->getFuncInfo());
Params_.kernelParams = ArgList_.data();

ExecItem_ = Backend->createExecItem(Params_.gridDim, Params_.blockDim,
Params_.sharedMemBytes, nullptr);
ExecItem_->setKernel(ChipKernel);

ExecItem_->copyArgs(TheParams->kernelParams);
ExecItem_->setArgs(TheParams->kernelParams);
ExecItem_->setupAllArgs();
}

Expand All @@ -149,18 +153,20 @@
Params_.extra = nullptr;
Params_.func = const_cast<void *>(HostFunction);
Params_.gridDim = GridDim;
Params_.kernelParams = Args;
Params_.sharedMemBytes = SharedMem;

auto Dev = Backend->getActiveDevice();
chipstar::Kernel *ChipKernel = Dev->findKernel(HostPtr(HostFunction));
if (!ChipKernel)
CHIPERR_LOG_AND_THROW("Could not find requested kernel",
hipErrorInvalidDeviceFunction);

copyKernelArgs(ArgList_, ArgData_, Args, *ChipKernel->getFuncInfo());
Params_.kernelParams = ArgList_.data();

ExecItem_ = Backend->createExecItem(GridDim, BlockDim, SharedMem, nullptr);
ExecItem_->setKernel(ChipKernel);

ExecItem_->copyArgs(Args);
ExecItem_->setArgs(Params_.kernelParams);
ExecItem_->setupAllArgs();
}

Expand Down
6 changes: 6 additions & 0 deletions src/CHIPGraph.hh
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,12 @@ public:

class CHIPGraphNodeKernel : public CHIPGraphNode {
private:
/// A block holding the bytes of the kernel arguments.
std::vector<char> ArgData_;

/// Pointers to the start of the kernel argument data, one per kernel argument.
std::vector<void *> ArgList_;

hipKernelNodeParams Params_;
chipstar::ExecItem *ExecItem_;

Expand Down
Loading
Loading