From 2ba6d5f0145eebefeb0b8c0718df7d7560b68173 Mon Sep 17 00:00:00 2001 From: Zeyu Li Date: Wed, 15 Feb 2023 18:14:41 +0800 Subject: [PATCH] [Amdgpu] Add amdgpu backend profiler (#7330) Issue: https://github.com/taichi-dev/taichi/issues/6434 ### Brief Summary 1. Currently only the default (event-based) profiler is available on AMDGPU 2. A screenshot of the profiler output is shown below --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/program/kernel_profiler.cpp | 8 + taichi/rhi/amdgpu/CMakeLists.txt | 1 + taichi/rhi/amdgpu/amdgpu_context.cpp | 17 ++ taichi/rhi/amdgpu/amdgpu_context.h | 5 + taichi/rhi/amdgpu/amdgpu_profiler.cpp | 252 ++++++++++++++++++ taichi/rhi/amdgpu/amdgpu_profiler.h | 77 ++++++ taichi/rhi/amdgpu/amdgpu_types.h | 15 ++ taichi/rhi/cuda/cuda_profiler.cpp | 9 +- taichi/runtime/llvm/llvm_runtime_executor.cpp | 5 + 9 files changed, 385 insertions(+), 4 deletions(-) create mode 100644 taichi/rhi/amdgpu/amdgpu_profiler.cpp create mode 100644 taichi/rhi/amdgpu/amdgpu_profiler.h create mode 100644 taichi/rhi/amdgpu/amdgpu_types.h diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp index 492162e6e248e..8c5f2653ee222 100644 --- a/taichi/program/kernel_profiler.cpp +++ b/taichi/program/kernel_profiler.cpp @@ -5,6 +5,8 @@ #include "taichi/rhi/cuda/cuda_profiler.h" #include "taichi/system/timeline.h" +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" + namespace taichi::lang { void KernelProfileStatisticalResult::insert_record(double t) { @@ -143,6 +145,12 @@ std::unique_ptr make_profiler(Arch arch, bool enable) { return std::make_unique(enable); #else TI_NOT_IMPLEMENTED; +#endif + } else if (arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + return std::make_unique(); +#else + TI_NOT_IMPLEMENTED #endif } else { return std::make_unique(); diff --git a/taichi/rhi/amdgpu/CMakeLists.txt b/taichi/rhi/amdgpu/CMakeLists.txt index 8c6e42417bb13..1bd646bb3b5d2 100644 --- a/taichi/rhi/amdgpu/CMakeLists.txt +++ 
b/taichi/rhi/amdgpu/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(${AMDGPU_RHI} amdgpu_caching_allocator.cpp amdgpu_context.cpp amdgpu_driver.cpp + amdgpu_profiler.cpp ) target_include_directories(${AMDGPU_RHI} diff --git a/taichi/rhi/amdgpu/amdgpu_context.cpp b/taichi/rhi/amdgpu/amdgpu_context.cpp index e85bdef9ae90d..957d83d8ef024 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.cpp +++ b/taichi/rhi/amdgpu/amdgpu_context.cpp @@ -8,7 +8,9 @@ #include "taichi/program/program.h" #include "taichi/system/threading.h" #include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" #include "taichi/analysis/offline_cache_util.h" +#include "taichi/util/offline_cache.h" namespace taichi { namespace lang { @@ -120,6 +122,17 @@ void AMDGPUContext::launch(void *func, unsigned grid_dim, unsigned block_dim, std::size_t dynamic_shared_mem_bytes) { + KernelProfilerBase::TaskHandle task_handle; + // Kernel launch + if (profiler_) { + KernelProfilerAMDGPU *profiler_amdgpu = + dynamic_cast(profiler_); + std::string primal_task_name, key; + bool valid = + offline_cache::try_demangle_name(task_name, primal_task_name, key); + profiler_amdgpu->trace(task_handle, valid ? 
primal_task_name : task_name, + func, grid_dim, block_dim, 0); + } auto pack_size = get_args_byte(arg_sizes); char *packed_arg = (char *)std::malloc(pack_size); pack_args(arg_pointers, arg_sizes, packed_arg); @@ -132,6 +145,10 @@ void AMDGPUContext::launch(void *func, reinterpret_cast(&config)); } std::free(packed_arg); + + if (profiler_) + profiler_->stop(task_handle); + if (debug_) { driver_.stream_synchronize(nullptr); } diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h index 00c9d130cf0fc..d63bef17ae6ba 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.h +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -20,6 +20,7 @@ class AMDGPUContext { int compute_capability_; std::string mcpu_; std::mutex lock_; + KernelProfilerBase *profiler_; AMDGPUDriver &driver_; bool debug_; std::vector kernel_arg_pointer_; @@ -53,6 +54,10 @@ class AMDGPUContext { int get_args_byte(std::vector arg_sizes); + void set_profiler(KernelProfilerBase *profiler) { + profiler_ = profiler; + } + void launch(void *func, const std::string &task_name, const std::vector &arg_pointers, diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp new file mode 100644 index 0000000000000..ed9a4b8573a0c --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -0,0 +1,252 @@ +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" +#include "taichi/rhi/amdgpu/amdgpu_types.h" + +namespace taichi::lang { +#if defined(TI_WITH_AMDGPU) + +std::string KernelProfilerAMDGPU::get_device_name() { + return AMDGPUContext::get_instance().get_device_name(); +} + +bool KernelProfilerAMDGPU::reinit_with_metrics( + const std::vector metrics) { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) { + if (toolkit_name.compare("default") == 0) { + return true; + } + TI_WARN("Only default(event) profiler is allowed on AMDGPU"); + return 
false; +} + +KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { + int register_per_thread = 0; + int static_shared_mem_per_block = 0; + // int max_active_blocks_per_multiprocessor = 0; + task_handle = event_toolkit_->start_with_handle(kernel_name); + KernelProfileTracedRecord record; + + AMDGPUDriver::get_instance().kernel_get_attribute( + ®ister_per_thread, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS, + kernel); + AMDGPUDriver::get_instance().kernel_get_attribute( + &static_shared_mem_per_block, + HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); + // kernel_get_occupancy doesn't work well + // AMDGPUDriver::get_instance().kernel_get_occupancy( + // &max_active_blocks_per_multiprocessor, kernel, block_size, + // dynamic_smem_size); + + record.name = kernel_name; + record.register_per_thread = register_per_thread; + record.shared_mem_per_block = static_shared_mem_per_block + dynamic_smem_size; + record.grid_size = grid_size; + record.block_size = block_size; + // record.active_blocks_per_multiprocessor = + // max_active_blocks_per_multiprocessor; + + traced_records_.push_back(record); +} + +void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { + AMDGPUDriver::get_instance().event_record(handle, 0); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + + // get elapsed time and destroy events + auto record = event_toolkit_->get_current_event_record(); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->kernel_elapsed_time_in_ms, record->start_event, handle); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->time_since_base, event_toolkit_->get_base_event(), + record->start_event); + + 
AMDGPUDriver::get_instance().event_destroy(record->start_event); + AMDGPUDriver::get_instance().event_destroy(record->stop_event); +} + +bool KernelProfilerAMDGPU::statistics_on_traced_records() { + for (auto &record : traced_records_) { + auto it = + std::find_if(statistical_results_.begin(), statistical_results_.end(), + [&](KernelProfileStatisticalResult &result) { + return result.name == record.name; + }); + if (it == statistical_results_.end()) { + statistical_results_.emplace_back(record.name); + it = std::prev(statistical_results_.end()); + } + it->insert_record(record.kernel_elapsed_time_in_ms); + total_time_ms_ += record.kernel_elapsed_time_in_ms; + } + + return true; +} + +void KernelProfilerAMDGPU::sync() { + AMDGPUDriver::get_instance().stream_synchronize(nullptr); +} + +void KernelProfilerAMDGPU::update() { + event_toolkit_->update_record(records_size_after_sync_, traced_records_); + event_toolkit_->update_timeline(traced_records_); + statistics_on_traced_records(); + event_toolkit_->clear(); + records_size_after_sync_ = traced_records_.size(); +} + +void KernelProfilerAMDGPU::clear() { + update(); + total_time_ms_ = 0; + records_size_after_sync_ = 0; + traced_records_.clear(); + statistical_results_.clear(); +} + +#else +std::string KernelProfilerAMDGPU::get_device_name() { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::reinit_with_metrics( + const std::vector metrics){TI_NOT_IMPLEMENTED} + +KernelProfilerBase::TaskHandle + KernelProfilerAMDGPU::start_with_handle(const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::statistics_on_traced_records() { + TI_NOT_IMPLEMENTED +} + +void 
KernelProfilerAMDGPU::sync() { + TI_NOT_IMPLEMENTED +} +void KernelProfilerAMDGPU::update() { + TI_NOT_IMPLEMENTED +} + +void KernelProfilerAMDGPU::clear(){TI_NOT_IMPLEMENTED} + +#endif + +#if defined(TI_WITH_AMDGPU) + +KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( + const std::string &kernel_name) { + EventRecord record; + record.name = kernel_name; + + AMDGPUDriver::get_instance().event_create(&(record.start_event), + HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_create(&(record.stop_event), + HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_record((record.start_event), 0); + event_records_.push_back(record); + + if (!base_event_) { + int n_iters = 100; + // Warm up + for (int i = 0; i < n_iters; i++) { + void *e; + AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_synchronize(e); + auto final_t = Time::get_time(); + if (i == n_iters - 1) { + base_event_ = e; + // ignore the overhead of sync, event_create and systematic time offset. 
+ base_time_ = final_t; + } else { + AMDGPUDriver::get_instance().event_destroy(e); + } + } + } + return record.stop_event; +} + +void EventToolkitAMDGPU::update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) { + uint32_t events_num = event_records_.size(); + uint32_t records_num = traced_records.size(); + TI_ERROR_IF( + records_size_after_sync + events_num != records_num, + "KernelProfilerAMDGPU::EventToolkitAMDGPU: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); + + uint32_t idx = 0; + for (auto &record : event_records_) { + // copy to traced_records_ then clear event_records_ + traced_records[records_size_after_sync + idx].kernel_elapsed_time_in_ms = + record.kernel_elapsed_time_in_ms; + traced_records[records_size_after_sync + idx].time_since_base = + record.time_since_base; + idx++; + } +} + +void EventToolkitAMDGPU::update_timeline( + std::vector &traced_records) { + if (Timelines::get_instance().get_enabled()) { + auto &timeline = Timeline::get_this_thread_instance(); + for (auto &record : traced_records) { + timeline.insert_event({record.name, /*param_name=begin*/ true, + base_time_ + record.time_since_base * 1e-3, + "amdgpu"}); + timeline.insert_event({record.name, /*param_name=begin*/ false, + base_time_ + (record.time_since_base + + record.kernel_elapsed_time_in_ms) * + 1e-3, + "amdgpu"}); + } + } +} + +#else + +KernelProfilerBase::TaskHandle + EventToolkitAMDGPU::start_with_handle(const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_timeline( + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; +} + +#endif + +} // namespace taichi::lang diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h new file mode 100644 index 0000000000000..3de30c82ff3f9 --- 
/dev/null +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -0,0 +1,77 @@ +#pragma once + +#include "taichi/system/timeline.h" +#include "taichi/program/kernel_profiler.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" + +#include +#include + +namespace taichi::lang { +class EventToolkitAMDGPU; + +class KernelProfilerAMDGPU : public KernelProfilerBase { + public: + KernelProfilerAMDGPU() { + event_toolkit_ = std::make_unique(); + } + std::string get_device_name() override; + + bool reinit_with_metrics(const std::vector metrics) override; + void trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size); + void sync() override; + void update() override; + void clear() override; + void stop(KernelProfilerBase::TaskHandle handle) override; + + bool set_profiler_toolkit(std::string toolkit_name) override; + + bool statistics_on_traced_records(); + + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) override; + + private: + std::unique_ptr event_toolkit_{nullptr}; + uint32_t records_size_after_sync_{0}; +}; + +class EventToolkitAMDGPU { + public: + void update_record(uint32_t records_size_after_sync, + std::vector &traced_records); + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name); + void update_timeline(std::vector &traced_records); + void clear() { + event_records_.clear(); + } + + private: + struct EventRecord { + std::string name; + float kernel_elapsed_time_in_ms{0.0}; + float time_since_base{0.0}; + void *start_event{nullptr}; + void *stop_event{nullptr}; + }; + float64 base_time_{0.0}; + void *base_event_{nullptr}; + // for cuEvent profiling, clear after sync() + std::vector event_records_; + + public: + EventRecord *get_current_event_record() { + return &(event_records_.back()); + } + void *get_base_event() const { + return base_event_; + 
} +}; +} // namespace taichi::lang diff --git a/taichi/rhi/amdgpu/amdgpu_types.h b/taichi/rhi/amdgpu/amdgpu_types.h new file mode 100644 index 0000000000000..4d2c795842e42 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_types.h @@ -0,0 +1,15 @@ +#pragma once + +typedef enum HIPfunction_attribute_enum { + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + HIP_FUNC_ATTRIBUTE_NUM_REGS = 4, + HIP_FUNC_ATTRIBUTE_PTX_VERSION = 5, + HIP_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + HIP_FUNC_ATTRIBUTE_MAX +} HIPfunction_attribute; diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 5bd11c2e0762f..774d462ce8f98 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -318,10 +318,11 @@ void EventToolkit::update_record( std::vector &traced_records) { uint32_t events_num = event_records_.size(); uint32_t records_num = traced_records.size(); - TI_ERROR_IF(records_size_after_sync + events_num != records_num, - "KernelProfilerCUDA::EventToolkit: event_records_.size({}) != " - "traced_records_.size({})", - records_size_after_sync + events_num, records_num); + TI_ERROR_IF( + records_size_after_sync + events_num != records_num, + "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); uint32_t idx = 0; for (auto &record : event_records_) { diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 2dd0c8f6b860c..f55d0bef68dfb 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -140,6 +140,11 @@ 
LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, if (config.saturating_grid_dim == 0) { config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2; } + if (config.kernel_profiler) { + AMDGPUContext::get_instance().set_profiler(profiler); + } else { + AMDGPUContext::get_instance().set_profiler(nullptr); + } AMDGPUContext::get_instance().set_debug(config.debug); device_ = std::make_shared(); }