From f779a4cd8f975b11bbc150bad2fef861cc171c6b Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 8 Feb 2023 13:44:41 +0800 Subject: [PATCH 01/15] refactor EventToolkit --- taichi/program/kernel_profiler.h | 39 +++++++++++++++++++++++++++ taichi/rhi/amdgpu/amdgpu_profiler.cpp | 0 taichi/rhi/amdgpu/amdgpu_profiler.h | 13 +++++++++ taichi/rhi/cuda/cuda_profiler.cpp | 14 +++++----- taichi/rhi/cuda/cuda_profiler.h | 36 +++++-------------------- 5 files changed, 65 insertions(+), 37 deletions(-) create mode 100644 taichi/rhi/amdgpu/amdgpu_profiler.cpp create mode 100644 taichi/rhi/amdgpu/amdgpu_profiler.h diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h index a2e18924c2b64..080c1e00b7892 100644 --- a/taichi/program/kernel_profiler.h +++ b/taichi/program/kernel_profiler.h @@ -103,6 +103,45 @@ class KernelProfilerBase { } }; +class EventToolkitBase { + public: + virtual void update_record(uint32_t records_size_after_sync, + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; + }; + virtual KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; + }; + virtual void update_timeline(std::vector &traced_records) { + TI_NOT_IMPLEMENTED; + }; + + private: + struct EventRecord { + std::string name; + float kernel_elapsed_time_in_ms{0.0}; + float time_since_base{0.0}; + void *start_event{nullptr}; + void *stop_event{nullptr}; + }; + float64 base_time_{0.0}; + void *base_event_{nullptr}; + // for cuEvent profiling, clear after sync() + std::vector event_records_; + + public: + void clear() { + event_records_.clear(); + } + EventRecord *get_current_event_record() { + return &(event_records_.back()); + } + void *get_base_event() const { + return base_event_; + } +}; + std::unique_ptr make_profiler(Arch arch, bool enable); } // namespace taichi::lang diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp new file mode 100644 index 0000000000000..e69de29bb2d1d diff 
--git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h new file mode 100644 index 0000000000000..486e2c8b9dda1 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -0,0 +1,13 @@ +#pragma once + +#include "taichi/system/timeline.h" +#include "taichi/program/kernel_profiler.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" + +#include +#include + +namespace taichi::lang { + +} diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 5bd11c2e0762f..392aaa5b1f35b 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -271,7 +271,7 @@ bool KernelProfilerCUDA::record_kernel_attributes(void *kernel, // default profiling toolkit : cuEvent // for now put it together with KernelProfilerCUDA #if defined(TI_WITH_CUDA) -KernelProfilerBase::TaskHandle EventToolkit::start_with_handle( +KernelProfilerBase::TaskHandle EventToolkitCUDA::start_with_handle( const std::string &kernel_name) { EventRecord record; record.name = kernel_name; @@ -313,13 +313,13 @@ KernelProfilerBase::TaskHandle EventToolkit::start_with_handle( return record.stop_event; } -void EventToolkit::update_record( +void EventToolkitCUDA::update_record( uint32_t records_size_after_sync, std::vector &traced_records) { uint32_t events_num = event_records_.size(); uint32_t records_num = traced_records.size(); TI_ERROR_IF(records_size_after_sync + events_num != records_num, - "KernelProfilerCUDA::EventToolkit: event_records_.size({}) != " + "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " "traced_records_.size({})", records_size_after_sync + events_num, records_num); @@ -334,7 +334,7 @@ void EventToolkit::update_record( } } -void EventToolkit::update_timeline( +void EventToolkitCUDA::update_timeline( std::vector &traced_records) { if (Timelines::get_instance().get_enabled()) { auto &timeline = Timeline::get_this_thread_instance(); @@ -354,16 +354,16 @@ void 
EventToolkit::update_timeline( } #else -KernelProfilerBase::TaskHandle EventToolkit::start_with_handle( +KernelProfilerBase::TaskHandle EventToolkitCUDA::start_with_handle( const std::string &kernel_name) { TI_NOT_IMPLEMENTED; } -void EventToolkit::update_record( +void EventToolkitCUDA::update_record( uint32_t records_size_after_sync, std::vector &traced_records) { TI_NOT_IMPLEMENTED; } -void EventToolkit::update_timeline( +void EventToolkitCUDA::update_timeline( std::vector &traced_records) { TI_NOT_IMPLEMENTED; } diff --git a/taichi/rhi/cuda/cuda_profiler.h b/taichi/rhi/cuda/cuda_profiler.h index 69a03c85f512d..b531ea9c09b69 100644 --- a/taichi/rhi/cuda/cuda_profiler.h +++ b/taichi/rhi/cuda/cuda_profiler.h @@ -17,7 +17,7 @@ enum class ProfilingToolkit : int { cupti, }; -class EventToolkit; +class EventToolkitCUDA; // A CUDA kernel profiler class KernelProfilerCUDA : public KernelProfilerBase { @@ -55,44 +55,20 @@ class KernelProfilerCUDA : public KernelProfilerBase { // Instances of these toolkits may exist at the same time, // but only one will be enabled. 
- std::unique_ptr event_toolkit_{nullptr}; + std::unique_ptr event_toolkit_{nullptr}; std::unique_ptr cupti_toolkit_{nullptr}; std::vector metric_list_; uint32_t records_size_after_sync_{0}; }; // default profiling toolkit -class EventToolkit { +class EventToolkitCUDA : public EventToolkitBase { public: void update_record(uint32_t records_size_after_sync, - std::vector &traced_records); + std::vector &traced_records) override; KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name); - void update_timeline(std::vector &traced_records); - void clear() { - event_records_.clear(); - } - - private: - struct EventRecord { - std::string name; - float kernel_elapsed_time_in_ms{0.0}; - float time_since_base{0.0}; - void *start_event{nullptr}; - void *stop_event{nullptr}; - }; - float64 base_time_{0.0}; - void *base_event_{nullptr}; - // for cuEvent profiling, clear after sync() - std::vector event_records_; - - public: - EventRecord *get_current_event_record() { - return &(event_records_.back()); - } - void *get_base_event() const { - return base_event_; - } + const std::string &kernel_name) override; + void update_timeline(std::vector &traced_records) override; }; } // namespace taichi::lang From d367b02dae10c868c1a060eb2601e009da9b1334 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 8 Feb 2023 14:18:31 +0800 Subject: [PATCH 02/15] add virtual destructor --- taichi/program/kernel_profiler.h | 5 ++++- taichi/rhi/cuda/cuda_profiler.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h index 080c1e00b7892..e0e38877b2085 100644 --- a/taichi/program/kernel_profiler.h +++ b/taichi/program/kernel_profiler.h @@ -117,7 +117,7 @@ class EventToolkitBase { TI_NOT_IMPLEMENTED; }; - private: + protected: struct EventRecord { std::string name; float kernel_elapsed_time_in_ms{0.0}; @@ -140,6 +140,9 @@ class EventToolkitBase { void *get_base_event() const { return 
base_event_; } + virtual ~EventToolkitBase() { + + }; }; std::unique_ptr make_profiler(Arch arch, bool enable); diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 392aaa5b1f35b..7238466569ac8 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -13,7 +13,7 @@ KernelProfilerCUDA::KernelProfilerCUDA(bool enable) { metric_list_.clear(); if (enable) { // default profiling toolkit: event tool_ = ProfilingToolkit::event; - event_toolkit_ = std::make_unique(); + event_toolkit_ = std::make_unique(); } } From 29b3cea914fbd0e2f14d3a29efca1baf10c7ab4e Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 8 Feb 2023 16:01:12 +0800 Subject: [PATCH 03/15] add profiler framework --- taichi/rhi/amdgpu/amdgpu_profiler.cpp | 129 ++++++++++++++++++++++++++ taichi/rhi/amdgpu/amdgpu_profiler.h | 38 +++++++- 2 files changed, 166 insertions(+), 1 deletion(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index e69de29bb2d1d..7707efbab809d 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -0,0 +1,129 @@ +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" +#include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_context.h" + +namespace taichi::lang { +#if defined(TI_WITH_AMDGPU) + +std::string KernelProfilerAMDGPU::get_device_name() { + return AMDGPUContext::get_instance().get_device_name(); +} + +bool KernelProfilerAMDGPU::reinit_with_metrics( + const std::vector metrics) { + TI_NOT_IMPLEMENTED +} + +KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { + TI_NOT_IMPLEMENTED; +} + +void 
KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::statistics_on_traced_records() { + TI_NOT_IMPLEMENTED +} + +void KernelProfilerAMDGPU::sync() { + amdgpuDriver::get_instance().stream_synchronize(nullptr); +} +void KernelProfilerAMDGPU::update() { + TI_NOT_IMPLEMENTED +} + +void KernelProfilerAMDGPU::clear() { + TI_NOT_IMPLEMENTED +} + +#else +std::string KernelProfilerAMDGPU::get_device_name() { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::reinit_with_metrics( + const std::vector metrics) { + TI_NOT_IMPLEMENTED +} + +KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { + TI_NOT_IMPLEMENTED; +} + +void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { + TI_NOT_IMPLEMENTED +} + +bool KernelProfilerAMDGPU::statistics_on_traced_records() { + TI_NOT_IMPLEMENTED +} + +void KernelProfilerAMDGPU::sync() { + TI_NOT_IMPLEMENTED +} +void KernelProfilerAMDGPU::update() { + TI_NOT_IMPLEMENTED +} + +void KernelProfilerAMDGPU::clear() { + TI_NOT_IMPLEMENTED +} + +#endif + +#if defined(TI_WITH_AMDGPU) + +KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_timeline( + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; +} + +#else + +KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( + const std::string &kernel_name) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_record( + uint32_t records_size_after_sync, + std::vector 
&traced_records) { + TI_NOT_IMPLEMENTED; +} +void EventToolkitAMDGPU::update_timeline( + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; +} + +#endif + +} \ No newline at end of file diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index 486e2c8b9dda1..a095f0ded111f 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -9,5 +9,41 @@ #include namespace taichi::lang { - + class EventToolkitAMDGPU; + + class KernelProfilerAMDGPU : public KernelProfilerBase { + public: + std::string get_device_name() override; + + bool reinit_with_metrics(const std::vector metrics) override; + void trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size); + void sync() override; + void update() override; + void clear() override; + void stop(KernelProfilerBase::TaskHandle handle) override; + + bool set_profiler_toolkit(std::string toolkit_name) override; + + bool statistics_on_traced_records(); + + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) override; + + private: + std::unique_ptr event_toolkit_{nullptr}; + }; + + class EventToolkitAMDGPU : public EventToolkitBase { + public: + void update_record(uint32_t records_size_after_sync, + std::vector &traced_records) override; + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) override; + void update_timeline(std::vector &traced_records) override; + }; } From 2ec9f84e05c2ee2c87711d69601ff10b3057a8be Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 8 Feb 2023 16:05:31 +0800 Subject: [PATCH 04/15] fix typo and add cmake --- taichi/rhi/amdgpu/CMakeLists.txt | 1 + taichi/rhi/amdgpu/amdgpu_profiler.cpp | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/taichi/rhi/amdgpu/CMakeLists.txt b/taichi/rhi/amdgpu/CMakeLists.txt index 8c6e42417bb13..1bd646bb3b5d2 
100644 --- a/taichi/rhi/amdgpu/CMakeLists.txt +++ b/taichi/rhi/amdgpu/CMakeLists.txt @@ -8,6 +8,7 @@ target_sources(${AMDGPU_RHI} amdgpu_caching_allocator.cpp amdgpu_context.cpp amdgpu_driver.cpp + amdgpu_profiler.cpp ) target_include_directories(${AMDGPU_RHI} diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index 7707efbab809d..14e9877f1c86f 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -11,7 +11,7 @@ std::string KernelProfilerAMDGPU::get_device_name() { bool KernelProfilerAMDGPU::reinit_with_metrics( const std::vector metrics) { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( @@ -37,7 +37,7 @@ bool KernelProfilerAMDGPU::statistics_on_traced_records() { } void KernelProfilerAMDGPU::sync() { - amdgpuDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); } void KernelProfilerAMDGPU::update() { TI_NOT_IMPLEMENTED From 40b0198be9ac59ba28b8369db513f502cf58e177 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 9 Feb 2023 13:18:52 +0800 Subject: [PATCH 05/15] add amdgpu_profiler context --- taichi/rhi/amdgpu/amdgpu_profiler.cpp | 140 ++++++++++++++++++++++++-- taichi/rhi/amdgpu/amdgpu_types.h | 15 +++ taichi/rhi/cuda/cuda_profiler.cpp | 9 +- 3 files changed, 152 insertions(+), 12 deletions(-) create mode 100644 taichi/rhi/amdgpu/amdgpu_types.h diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index 14e9877f1c86f..dee11647203fc 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -1,6 +1,7 @@ #include "taichi/rhi/amdgpu/amdgpu_profiler.h" #include "taichi/rhi/amdgpu/amdgpu_driver.h" #include "taichi/rhi/amdgpu/amdgpu_context.h" +#include "taichi/rhi/amdgpu/amdgpu_types.h" namespace taichi::lang { #if defined(TI_WITH_AMDGPU) @@ -14,6 +15,13 @@ bool 
KernelProfilerAMDGPU::reinit_with_metrics( TI_NOT_IMPLEMENTED } +bool set_profiler_toolkit(std::string toolkit_name) override { + if (toolkit_name.compare("default") == 0) { + return true; + } + TI_WARN("Only default(event) profiler is allowed on AMDGPU"); +} + KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( const std::string &kernel_name) { TI_NOT_IMPLEMENTED; @@ -25,26 +33,84 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, uint32_t grid_size, uint32_t block_size, uint32_t dynamic_smem_size) { - TI_NOT_IMPLEMENTED; + int register_per_thread = 0; + int static_shared_mem_per_block = 0; + int max_active_blocks_per_multiprocessor = 0; + task_handle = event_toolkit_->start_with_handle(kernel_name); + KernelProfileTracedRecord record; + + AMDGPUDriver::get_instance().kernel_get_attribute( + &register_per_thread, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_NUM_REGS, + kernel); + AMDGPUDriver::get_instance().kernel_get_attribute( + &static_shared_mem_per_block, + HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); + AMDGPUDriver::get_instance().kernel_get_occupancy( + &max_active_blocks_per_multiprocessor, kernel, block_size, + dynamic_smem_size); + + record.name = kernel_name; + record.register_per_thread = register_per_thread; + record.shared_mem_per_block = static_shared_mem_per_block + dynamic_smem_size; + record.grid_size = grid_size; + record.block_size = block_size; + record.active_blocks_per_multiprocessor = + max_active_blocks_per_multiprocessor; + + traced_records_.push_back(record); } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - TI_NOT_IMPLEMENTED + AMDGPUDriver::get_instance().event_record(handle, 0); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + + // get elapsed time and destroy events + auto record = event_toolkit_->get_current_event_record(); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->kernel_elapsed_time_in_ms, 
record->start_event, handle); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->time_since_base, event_toolkit_->get_base_event(), + record->start_event); + + AMDGPUDriver::get_instance().event_destroy(record->start_event); + AMDGPUDriver::get_instance().event_destroy(record->stop_event); } bool KernelProfilerAMDGPU::statistics_on_traced_records() { - TI_NOT_IMPLEMENTED + for (auto &record : traced_records_) { + auto it = + std::find_if(statistical_results_.begin(), statistical_results_.end(), + [&](KernelProfileStatisticalResult &result) { + return result.name == record.name; + }); + if (it == statistical_results_.end()) { + statistical_results_.emplace_back(record.name); + it = std::prev(statistical_results_.end()); + } + it->insert_record(record.kernel_elapsed_time_in_ms); + total_time_ms_ += record.kernel_elapsed_time_in_ms; + } + + return true; } void KernelProfilerAMDGPU::sync() { AMDGPUDriver::get_instance().stream_synchronize(nullptr); } void KernelProfilerAMDGPU::update() { - TI_NOT_IMPLEMENTED + event_toolkit_->update_record(records_size_after_sync_, traced_records_); + event_toolkit_->update_timeline(traced_records_); + statistics_on_traced_records(); + event_toolkit_->clear(); + records_size_after_sync_ = traced_records_.size(); } void KernelProfilerAMDGPU::clear() { - TI_NOT_IMPLEMENTED + update(); + total_time_ms_ = 0; + records_size_after_sync_ = 0; + traced_records_.clear(); + statistical_results_.clear(); } #else @@ -96,16 +162,74 @@ void KernelProfilerAMDGPU::clear() { KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( const std::string &kernel_name) { - TI_NOT_IMPLEMENTED; + EventRecord record; + record.name = kernel_name; + + AMDGPUDriver::get_instance().event_create(&(record.start_event), + HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_create(&(record.stop_event), + HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_record((record.start_event), 0); + event_records_.push_back(record); + + if 
(!base_event_) { + int n_iters = 100; + // Warm up + for (int i = 0; i < n_iters; i++) { + void *e; + AMDGPUDriver::get_instance().event_create(&e, HIP_EVENT_DEFAULT); + AMDGPUDriver::get_instance().event_record(e, 0); + AMDGPUDriver::get_instance().event_synchronize(e); + auto final_t = Time::get_time(); + if (i == n_iters - 1) { + base_event_ = e; + // ignore the overhead of sync, event_create and systematic time offset. + base_time_ = final_t; + } else { + AMDGPUDriver::get_instance().event_destroy(e); + } + } + } + return record.stop_event; } + void EventToolkitAMDGPU::update_record( uint32_t records_size_after_sync, std::vector &traced_records) { - TI_NOT_IMPLEMENTED; + uint32_t events_num = event_records_.size(); + uint32_t records_num = traced_records.size(); + TI_ERROR_IF( + records_size_after_sync + events_num != records_num, + "KernelProfilerAMDGPU::EventToolkitAMDGPU: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); + + uint32_t idx = 0; + for (auto &record : event_records_) { + // copy to traced_records_ then clear event_records_ + traced_records[records_size_after_sync + idx].kernel_elapsed_time_in_ms = + record.kernel_elapsed_time_in_ms; + traced_records[records_size_after_sync + idx].time_since_base = + record.time_since_base; + idx++; + } } + void EventToolkitAMDGPU::update_timeline( std::vector &traced_records) { - TI_NOT_IMPLEMENTED; + if (Timelines::get_instance().get_enabled()) { + auto &timeline = Timeline::get_this_thread_instance(); + for (auto &record : traced_records) { + timeline.insert_event({record.name, /*param_name=begin*/ true, + base_time_ + record.time_since_base * 1e-3, + "amdgpu"}); + timeline.insert_event({record.name, /*param_name=begin*/ false, + base_time_ + (record.time_since_base + + record.kernel_elapsed_time_in_ms) * + 1e-3, + "amdgpu"}); + } + } } #else diff --git a/taichi/rhi/amdgpu/amdgpu_types.h b/taichi/rhi/amdgpu/amdgpu_types.h new file mode 100644 index 
0000000000000..9a5a59f4cadd9 --- /dev/null +++ b/taichi/rhi/amdgpu/amdgpu_types.h @@ -0,0 +1,15 @@ +#pragma once + +typedef enum HIPfunction_attribute_enum { + HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 0, + HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES = 1, + HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES = 2, + HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES = 3, + HIP_FUNC_ATTRIBUTE_NUM_REGS = 4, + HIP_FUNC_ATTRIBUTE_PTX_VERSION = 5, + HIP_FUNC_ATTRIBUTE_BINARY_VERSION = 6, + HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA = 7, + HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, + HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, + HIP_FUNC_ATTRIBUTE_MAX +} HIPfunction_attribute; \ No newline at end of file diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 7238466569ac8..53e4b4180178c 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -318,10 +318,11 @@ void EventToolkitCUDA::update_record( std::vector &traced_records) { uint32_t events_num = event_records_.size(); uint32_t records_num = traced_records.size(); - TI_ERROR_IF(records_size_after_sync + events_num != records_num, - "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " - "traced_records_.size({})", - records_size_after_sync + events_num, records_num); + TI_ERROR_IF( + records_size_after_sync + events_num != records_num, + "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); uint32_t idx = 0; for (auto &record : event_records_) { From 404d8356f183800f0c73c0c13f54f1392c15bec0 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 9 Feb 2023 13:21:48 +0800 Subject: [PATCH 06/15] fix bug --- taichi/rhi/amdgpu/amdgpu_profiler.cpp | 4 +++- taichi/rhi/amdgpu/amdgpu_profiler.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index dee11647203fc..7c4170084071e 100644 
--- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -15,11 +15,12 @@ bool KernelProfilerAMDGPU::reinit_with_metrics( TI_NOT_IMPLEMENTED } -bool set_profiler_toolkit(std::string toolkit_name) override { +bool set_profiler_toolkit(std::string toolkit_name) { if (toolkit_name.compare("default") == 0) { return true; } TI_WARN("Only default(event) profiler is allowed on AMDGPU"); + return false; } KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( @@ -97,6 +98,7 @@ bool KernelProfilerAMDGPU::statistics_on_traced_records() { void KernelProfilerAMDGPU::sync() { AMDGPUDriver::get_instance().stream_synchronize(nullptr); } + void KernelProfilerAMDGPU::update() { event_toolkit_->update_record(records_size_after_sync_, traced_records_); event_toolkit_->update_timeline(traced_records_); diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index a095f0ded111f..0faf1c240ddab 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -36,6 +36,7 @@ namespace taichi::lang { private: std::unique_ptr event_toolkit_{nullptr}; + uint32_t records_size_after_sync_{0}; }; class EventToolkitAMDGPU : public EventToolkitBase { From 62f09ad88e73384147b30d063ddc6f1d003f4f5f Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 9 Feb 2023 13:38:00 +0800 Subject: [PATCH 07/15] add profiler usage during launch --- taichi/program/kernel_profiler.cpp | 8 ++++++++ taichi/rhi/amdgpu/amdgpu_context.cpp | 17 +++++++++++++++++ taichi/rhi/amdgpu/amdgpu_context.h | 5 +++++ taichi/rhi/amdgpu/amdgpu_profiler.cpp | 2 +- taichi/runtime/llvm/llvm_runtime_executor.cpp | 5 +++++ 5 files changed, 36 insertions(+), 1 deletion(-) diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp index aecc0f36b8f54..cd1f6fbb1f70b 100644 --- a/taichi/program/kernel_profiler.cpp +++ b/taichi/program/kernel_profiler.cpp @@ -5,6 +5,8 @@ #include "taichi/rhi/cuda/cuda_profiler.h" 
#include "taichi/system/timeline.h" +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" + namespace taichi::lang { void KernelProfileStatisticalResult::insert_record(double t) { @@ -124,6 +126,12 @@ std::unique_ptr make_profiler(Arch arch, bool enable) { return std::make_unique(enable); #else TI_NOT_IMPLEMENTED; +#endif + } else if (arch == Arch::amdgpu) { +#if defined(TI_WITH_AMDGPU) + return std::make_unique(); +#else + TI_NOT_IMPLEMENTED #endif } else { return std::make_unique(); diff --git a/taichi/rhi/amdgpu/amdgpu_context.cpp b/taichi/rhi/amdgpu/amdgpu_context.cpp index e85bdef9ae90d..d4ca43e8e782c 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.cpp +++ b/taichi/rhi/amdgpu/amdgpu_context.cpp @@ -8,7 +8,9 @@ #include "taichi/program/program.h" #include "taichi/system/threading.h" #include "taichi/rhi/amdgpu/amdgpu_driver.h" +#include "taichi/rhi/amdgpu/amdgpu_profiler.h" #include "taichi/analysis/offline_cache_util.h" +#include "taichi/util/offline_cache.h" namespace taichi { namespace lang { @@ -120,6 +122,17 @@ void AMDGPUContext::launch(void *func, unsigned grid_dim, unsigned block_dim, std::size_t dynamic_shared_mem_bytes) { + KernelProfilerBase::TaskHandle task_handle; + // Kernel launch + if (profiler_) { + KernelProfilerAMDGPU *profiler_amdgpu = + dynamic_cast(profiler_); + std::string primal_task_name, key; + bool valid = + offline_cache::try_demangle_name(task_name, primal_task_name, key); + profiler_amdgpu->trace(task_handle, valid ? 
primal_task_name : task_name, + func, grid_dim, block_dim, 0); + } auto pack_size = get_args_byte(arg_sizes); char *packed_arg = (char *)std::malloc(pack_size); pack_args(arg_pointers, arg_sizes, packed_arg); @@ -132,6 +145,10 @@ void AMDGPUContext::launch(void *func, reinterpret_cast(&config)); } std::free(packed_arg); + + if (profiler_) + profiler_->stop(task_handle); + if (debug_) { driver_.stream_synchronize(nullptr); } diff --git a/taichi/rhi/amdgpu/amdgpu_context.h b/taichi/rhi/amdgpu/amdgpu_context.h index 6c0b3048824f1..149623cb5b97b 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.h +++ b/taichi/rhi/amdgpu/amdgpu_context.h @@ -20,6 +20,7 @@ class AMDGPUContext { int compute_capability_; std::string mcpu_; std::mutex lock_; + KernelProfilerBase *profiler_; AMDGPUDriver &driver_; bool debug_; @@ -40,6 +41,10 @@ class AMDGPUContext { int get_args_byte(std::vector arg_sizes); + void set_profiler(KernelProfilerBase *profiler) { + profiler_ = profiler; + } + void launch(void *func, const std::string &task_name, const std::vector &arg_pointers, diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index 7c4170084071e..11463d940a84a 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -15,7 +15,7 @@ bool KernelProfilerAMDGPU::reinit_with_metrics( TI_NOT_IMPLEMENTED } -bool set_profiler_toolkit(std::string toolkit_name) { +bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) { if (toolkit_name.compare("default") == 0) { return true; } diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp index 644c329c7eb7a..2b41287c9b64c 100644 --- a/taichi/runtime/llvm/llvm_runtime_executor.cpp +++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp @@ -140,6 +140,11 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config, if (config.saturating_grid_dim == 0) { config.saturating_grid_dim = num_workgroups * 
query_max_block_per_cu * 2; } + if (config.kernel_profiler) { + AMDGPUContext::get_instance().set_profiler(profiler); + } else { + AMDGPUContext::get_instance().set_profiler(nullptr); + } AMDGPUContext::get_instance().set_debug(config.debug); device_ = std::make_shared(); } From b4c807a1a301ee6a87ea8c1eb0da2cb3d3214af8 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Thu, 9 Feb 2023 14:21:19 +0800 Subject: [PATCH 08/15] debug profiler --- taichi/rhi/amdgpu/amdgpu_profiler.cpp | 13 +++++++------ taichi/rhi/amdgpu/amdgpu_profiler.h | 3 +++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index 11463d940a84a..fef87218d0953 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -36,7 +36,7 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, uint32_t dynamic_smem_size) { int register_per_thread = 0; int static_shared_mem_per_block = 0; - int max_active_blocks_per_multiprocessor = 0; + // int max_active_blocks_per_multiprocessor = 0; task_handle = event_toolkit_->start_with_handle(kernel_name); KernelProfileTracedRecord record; @@ -46,17 +46,18 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, AMDGPUDriver::get_instance().kernel_get_attribute( &static_shared_mem_per_block, HIPfunction_attribute::HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, kernel); - AMDGPUDriver::get_instance().kernel_get_occupancy( - &max_active_blocks_per_multiprocessor, kernel, block_size, - dynamic_smem_size); + // kernel_get_occupancy doesn't work well + // AMDGPUDriver::get_instance().kernel_get_occupancy( + // &max_active_blocks_per_multiprocessor, kernel, block_size, + // dynamic_smem_size); record.name = kernel_name; record.register_per_thread = register_per_thread; record.shared_mem_per_block = static_shared_mem_per_block + dynamic_smem_size; record.grid_size = grid_size; record.block_size = block_size; - 
record.active_blocks_per_multiprocessor = - max_active_blocks_per_multiprocessor; + // record.active_blocks_per_multiprocessor = + // max_active_blocks_per_multiprocessor; traced_records_.push_back(record); } diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index 0faf1c240ddab..c1c7abda2f727 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -13,6 +13,9 @@ namespace taichi::lang { class KernelProfilerAMDGPU : public KernelProfilerBase { public: + KernelProfilerAMDGPU() { + event_toolkit_ = std::make_unique(); + } std::string get_device_name() override; bool reinit_with_metrics(const std::vector metrics) override; From fa6a3f72985474d18b361ab56f952215706fc82d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 9 Feb 2023 06:27:24 +0000 Subject: [PATCH 09/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/program/kernel_profiler.h | 18 +++--- taichi/rhi/amdgpu/amdgpu_context.cpp | 2 +- taichi/rhi/amdgpu/amdgpu_profiler.cpp | 88 +++++++++++++-------------- taichi/rhi/amdgpu/amdgpu_profiler.h | 86 +++++++++++++------------- taichi/rhi/amdgpu/amdgpu_types.h | 2 +- taichi/rhi/cuda/cuda_profiler.h | 8 ++- 6 files changed, 103 insertions(+), 101 deletions(-) diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h index e0e38877b2085..d481364a5b3df 100644 --- a/taichi/program/kernel_profiler.h +++ b/taichi/program/kernel_profiler.h @@ -105,15 +105,17 @@ class KernelProfilerBase { class EventToolkitBase { public: - virtual void update_record(uint32_t records_size_after_sync, - std::vector &traced_records) { - TI_NOT_IMPLEMENTED; - }; + virtual void update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) { + TI_NOT_IMPLEMENTED; + }; virtual KernelProfilerBase::TaskHandle start_with_handle( const std::string 
&kernel_name) { - TI_NOT_IMPLEMENTED; - }; - virtual void update_timeline(std::vector &traced_records) { + TI_NOT_IMPLEMENTED; + }; + virtual void update_timeline( + std::vector &traced_records) { TI_NOT_IMPLEMENTED; }; @@ -140,7 +142,7 @@ class EventToolkitBase { void *get_base_event() const { return base_event_; } - virtual ~EventToolkitBase() { + virtual ~EventToolkitBase(){ }; }; diff --git a/taichi/rhi/amdgpu/amdgpu_context.cpp b/taichi/rhi/amdgpu/amdgpu_context.cpp index d4ca43e8e782c..957d83d8ef024 100644 --- a/taichi/rhi/amdgpu/amdgpu_context.cpp +++ b/taichi/rhi/amdgpu/amdgpu_context.cpp @@ -131,7 +131,7 @@ void AMDGPUContext::launch(void *func, bool valid = offline_cache::try_demangle_name(task_name, primal_task_name, key); profiler_amdgpu->trace(task_handle, valid ? primal_task_name : task_name, - func, grid_dim, block_dim, 0); + func, grid_dim, block_dim, 0); } auto pack_size = get_args_byte(arg_sizes); char *packed_arg = (char *)std::malloc(pack_size); diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.cpp b/taichi/rhi/amdgpu/amdgpu_profiler.cpp index fef87218d0953..ed9a4b8573a0c 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.cpp +++ b/taichi/rhi/amdgpu/amdgpu_profiler.cpp @@ -12,7 +12,7 @@ std::string KernelProfilerAMDGPU::get_device_name() { bool KernelProfilerAMDGPU::reinit_with_metrics( const std::vector metrics) { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } bool KernelProfilerAMDGPU::set_profiler_toolkit(std::string toolkit_name) { @@ -29,11 +29,11 @@ KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( } void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, - const std::string &kernel_name, - void *kernel, - uint32_t grid_size, - uint32_t block_size, - uint32_t dynamic_smem_size) { + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { int register_per_thread = 0; int static_shared_mem_per_block = 0; // int 
max_active_blocks_per_multiprocessor = 0; @@ -63,19 +63,19 @@ void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - AMDGPUDriver::get_instance().event_record(handle, 0); - AMDGPUDriver::get_instance().stream_synchronize(nullptr); - - // get elapsed time and destroy events - auto record = event_toolkit_->get_current_event_record(); - AMDGPUDriver::get_instance().event_elapsed_time( - &record->kernel_elapsed_time_in_ms, record->start_event, handle); - AMDGPUDriver::get_instance().event_elapsed_time( - &record->time_since_base, event_toolkit_->get_base_event(), - record->start_event); - - AMDGPUDriver::get_instance().event_destroy(record->start_event); - AMDGPUDriver::get_instance().event_destroy(record->stop_event); + AMDGPUDriver::get_instance().event_record(handle, 0); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); + + // get elapsed time and destroy events + auto record = event_toolkit_->get_current_event_record(); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->kernel_elapsed_time_in_ms, record->start_event, handle); + AMDGPUDriver::get_instance().event_elapsed_time( + &record->time_since_base, event_toolkit_->get_base_event(), + record->start_event); + + AMDGPUDriver::get_instance().event_destroy(record->start_event); + AMDGPUDriver::get_instance().event_destroy(record->stop_event); } bool KernelProfilerAMDGPU::statistics_on_traced_records() { @@ -97,13 +97,13 @@ bool KernelProfilerAMDGPU::statistics_on_traced_records() { } void KernelProfilerAMDGPU::sync() { - AMDGPUDriver::get_instance().stream_synchronize(nullptr); + AMDGPUDriver::get_instance().stream_synchronize(nullptr); } void KernelProfilerAMDGPU::update() { event_toolkit_->update_record(records_size_after_sync_, traced_records_); event_toolkit_->update_timeline(traced_records_); - statistics_on_traced_records(); + statistics_on_traced_records(); event_toolkit_->clear(); 
records_size_after_sync_ = traced_records_.size(); } @@ -118,46 +118,42 @@ void KernelProfilerAMDGPU::clear() { #else std::string KernelProfilerAMDGPU::get_device_name() { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } bool KernelProfilerAMDGPU::reinit_with_metrics( - const std::vector metrics) { - TI_NOT_IMPLEMENTED -} + const std::vector metrics){TI_NOT_IMPLEMENTED} -KernelProfilerBase::TaskHandle KernelProfilerAMDGPU::start_with_handle( - const std::string &kernel_name) { +KernelProfilerBase::TaskHandle + KernelProfilerAMDGPU::start_with_handle(const std::string &kernel_name) { TI_NOT_IMPLEMENTED; } void KernelProfilerAMDGPU::trace(KernelProfilerBase::TaskHandle &task_handle, - const std::string &kernel_name, - void *kernel, - uint32_t grid_size, - uint32_t block_size, - uint32_t dynamic_smem_size) { - TI_NOT_IMPLEMENTED; + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size) { + TI_NOT_IMPLEMENTED; } void KernelProfilerAMDGPU::stop(KernelProfilerBase::TaskHandle handle) { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } bool KernelProfilerAMDGPU::statistics_on_traced_records() { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } void KernelProfilerAMDGPU::sync() { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } void KernelProfilerAMDGPU::update() { - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED } -void KernelProfilerAMDGPU::clear() { - TI_NOT_IMPLEMENTED -} +void KernelProfilerAMDGPU::clear(){TI_NOT_IMPLEMENTED} #endif @@ -169,9 +165,9 @@ KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( record.name = kernel_name; AMDGPUDriver::get_instance().event_create(&(record.start_event), - HIP_EVENT_DEFAULT); + HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_create(&(record.stop_event), - HIP_EVENT_DEFAULT); + HIP_EVENT_DEFAULT); AMDGPUDriver::get_instance().event_record((record.start_event), 0); event_records_.push_back(record); @@ -220,7 +216,7 @@ void EventToolkitAMDGPU::update_record( void 
EventToolkitAMDGPU::update_timeline( std::vector &traced_records) { - if (Timelines::get_instance().get_enabled()) { + if (Timelines::get_instance().get_enabled()) { auto &timeline = Timeline::get_this_thread_instance(); for (auto &record : traced_records) { timeline.insert_event({record.name, /*param_name=begin*/ true, @@ -237,8 +233,8 @@ void EventToolkitAMDGPU::update_timeline( #else -KernelProfilerBase::TaskHandle EventToolkitAMDGPU::start_with_handle( - const std::string &kernel_name) { +KernelProfilerBase::TaskHandle + EventToolkitAMDGPU::start_with_handle(const std::string &kernel_name) { TI_NOT_IMPLEMENTED; } void EventToolkitAMDGPU::update_record( @@ -253,4 +249,4 @@ void EventToolkitAMDGPU::update_timeline( #endif -} \ No newline at end of file +} // namespace taichi::lang diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index c1c7abda2f727..0dc7fbf2ca8e4 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -9,45 +9,47 @@ #include namespace taichi::lang { - class EventToolkitAMDGPU; - - class KernelProfilerAMDGPU : public KernelProfilerBase { - public: - KernelProfilerAMDGPU() { - event_toolkit_ = std::make_unique(); - } - std::string get_device_name() override; - - bool reinit_with_metrics(const std::vector metrics) override; - void trace(KernelProfilerBase::TaskHandle &task_handle, - const std::string &kernel_name, - void *kernel, - uint32_t grid_size, - uint32_t block_size, - uint32_t dynamic_smem_size); - void sync() override; - void update() override; - void clear() override; - void stop(KernelProfilerBase::TaskHandle handle) override; - - bool set_profiler_toolkit(std::string toolkit_name) override; - - bool statistics_on_traced_records(); - - KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name) override; - - private: - std::unique_ptr event_toolkit_{nullptr}; - uint32_t records_size_after_sync_{0}; - }; - - class EventToolkitAMDGPU : public 
EventToolkitBase { - public: - void update_record(uint32_t records_size_after_sync, - std::vector &traced_records) override; - KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name) override; - void update_timeline(std::vector &traced_records) override; - }; -} +class EventToolkitAMDGPU; + +class KernelProfilerAMDGPU : public KernelProfilerBase { + public: + KernelProfilerAMDGPU() { + event_toolkit_ = std::make_unique(); + } + std::string get_device_name() override; + + bool reinit_with_metrics(const std::vector metrics) override; + void trace(KernelProfilerBase::TaskHandle &task_handle, + const std::string &kernel_name, + void *kernel, + uint32_t grid_size, + uint32_t block_size, + uint32_t dynamic_smem_size); + void sync() override; + void update() override; + void clear() override; + void stop(KernelProfilerBase::TaskHandle handle) override; + + bool set_profiler_toolkit(std::string toolkit_name) override; + + bool statistics_on_traced_records(); + + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) override; + + private: + std::unique_ptr event_toolkit_{nullptr}; + uint32_t records_size_after_sync_{0}; +}; + +class EventToolkitAMDGPU : public EventToolkitBase { + public: + void update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) override; + KernelProfilerBase::TaskHandle start_with_handle( + const std::string &kernel_name) override; + void update_timeline( + std::vector &traced_records) override; +}; +} // namespace taichi::lang diff --git a/taichi/rhi/amdgpu/amdgpu_types.h b/taichi/rhi/amdgpu/amdgpu_types.h index 9a5a59f4cadd9..4d2c795842e42 100644 --- a/taichi/rhi/amdgpu/amdgpu_types.h +++ b/taichi/rhi/amdgpu/amdgpu_types.h @@ -12,4 +12,4 @@ typedef enum HIPfunction_attribute_enum { HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES = 8, HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT = 9, HIP_FUNC_ATTRIBUTE_MAX -} HIPfunction_attribute; \ No newline at end of file 
+} HIPfunction_attribute; diff --git a/taichi/rhi/cuda/cuda_profiler.h b/taichi/rhi/cuda/cuda_profiler.h index b531ea9c09b69..cbb8c798a3ea4 100644 --- a/taichi/rhi/cuda/cuda_profiler.h +++ b/taichi/rhi/cuda/cuda_profiler.h @@ -64,11 +64,13 @@ class KernelProfilerCUDA : public KernelProfilerBase { // default profiling toolkit class EventToolkitCUDA : public EventToolkitBase { public: - void update_record(uint32_t records_size_after_sync, - std::vector &traced_records) override; + void update_record( + uint32_t records_size_after_sync, + std::vector &traced_records) override; KernelProfilerBase::TaskHandle start_with_handle( const std::string &kernel_name) override; - void update_timeline(std::vector &traced_records) override; + void update_timeline( + std::vector &traced_records) override; }; } // namespace taichi::lang From 4d94f40c4e0c0c557c862f6ae5fb974e23b5748f Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 15 Feb 2023 12:00:51 +0800 Subject: [PATCH 10/15] revert cuda profiler --- taichi/rhi/cuda/cuda_profiler.cpp | 9 ++++---- taichi/rhi/cuda/cuda_profiler.h | 38 ++++++++++++++++++++++++------- 2 files changed, 34 insertions(+), 13 deletions(-) diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 53e4b4180178c..7238466569ac8 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -318,11 +318,10 @@ void EventToolkitCUDA::update_record( std::vector &traced_records) { uint32_t events_num = event_records_.size(); uint32_t records_num = traced_records.size(); - TI_ERROR_IF( - records_size_after_sync + events_num != records_num, - "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " - "traced_records_.size({})", - records_size_after_sync + events_num, records_num); + TI_ERROR_IF(records_size_after_sync + events_num != records_num, + "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); 
uint32_t idx = 0; for (auto &record : event_records_) { diff --git a/taichi/rhi/cuda/cuda_profiler.h b/taichi/rhi/cuda/cuda_profiler.h index cbb8c798a3ea4..9b3765abf6403 100644 --- a/taichi/rhi/cuda/cuda_profiler.h +++ b/taichi/rhi/cuda/cuda_profiler.h @@ -62,15 +62,37 @@ class KernelProfilerCUDA : public KernelProfilerBase { }; // default profiling toolkit -class EventToolkitCUDA : public EventToolkitBase { +class EventToolkit { public: - void update_record( - uint32_t records_size_after_sync, - std::vector &traced_records) override; + void update_record(uint32_t records_size_after_sync, + std::vector &traced_records); KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name) override; - void update_timeline( - std::vector &traced_records) override; + const std::string &kernel_name); + void update_timeline(std::vector &traced_records); + void clear() { + event_records_.clear(); + } + + private: + struct EventRecord { + std::string name; + float kernel_elapsed_time_in_ms{0.0}; + float time_since_base{0.0}; + void *start_event{nullptr}; + void *stop_event{nullptr}; + }; + float64 base_time_{0.0}; + void *base_event_{nullptr}; + // for cuEvent profiling, clear after sync() + std::vector event_records_; + + public: + EventRecord *get_current_event_record() { + return &(event_records_.back()); + } + void *get_base_event() const { + return base_event_; + } }; -} // namespace taichi::lang +} // namespace taichi::lang \ No newline at end of file From 09a73ad8608e25024446f8134b97345a09987791 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 15 Feb 2023 12:03:53 +0800 Subject: [PATCH 11/15] revert kernel_profiler.h --- taichi/program/kernel_profiler.h | 44 -------------------------------- 1 file changed, 44 deletions(-) diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h index d481364a5b3df..a2e18924c2b64 100644 --- a/taichi/program/kernel_profiler.h +++ b/taichi/program/kernel_profiler.h @@ -103,50 +103,6 @@ class 
KernelProfilerBase { } }; -class EventToolkitBase { - public: - virtual void update_record( - uint32_t records_size_after_sync, - std::vector &traced_records) { - TI_NOT_IMPLEMENTED; - }; - virtual KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name) { - TI_NOT_IMPLEMENTED; - }; - virtual void update_timeline( - std::vector &traced_records) { - TI_NOT_IMPLEMENTED; - }; - - protected: - struct EventRecord { - std::string name; - float kernel_elapsed_time_in_ms{0.0}; - float time_since_base{0.0}; - void *start_event{nullptr}; - void *stop_event{nullptr}; - }; - float64 base_time_{0.0}; - void *base_event_{nullptr}; - // for cuEvent profiling, clear after sync() - std::vector event_records_; - - public: - void clear() { - event_records_.clear(); - } - EventRecord *get_current_event_record() { - return &(event_records_.back()); - } - void *get_base_event() const { - return base_event_; - } - virtual ~EventToolkitBase(){ - - }; -}; - std::unique_ptr make_profiler(Arch arch, bool enable); } // namespace taichi::lang From 4b2d8903eada4fc4648ea408e7fce88e0d6bf9cb Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 15 Feb 2023 13:16:14 +0800 Subject: [PATCH 12/15] remove BASE class --- taichi/rhi/amdgpu/amdgpu_profiler.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index 0dc7fbf2ca8e4..c966b4ff062e4 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -42,7 +42,7 @@ class KernelProfilerAMDGPU : public KernelProfilerBase { uint32_t records_size_after_sync_{0}; }; -class EventToolkitAMDGPU : public EventToolkitBase { +class EventToolkitAMDGPU { public: void update_record( uint32_t records_size_after_sync, @@ -51,5 +51,29 @@ class EventToolkitAMDGPU : public EventToolkitBase { const std::string &kernel_name) override; void update_timeline( std::vector &traced_records) override; + void 
clear() { + event_records_.clear(); + } + + private: + struct EventRecord { + std::string name; + float kernel_elapsed_time_in_ms{0.0}; + float time_since_base{0.0}; + void *start_event{nullptr}; + void *stop_event{nullptr}; + }; + float64 base_time_{0.0}; + void *base_event_{nullptr}; + // for cuEvent profiling, clear after sync() + std::vector event_records_; + + public: + EventRecord *get_current_event_record() { + return &(event_records_.back()); + } + void *get_base_event() const { + return base_event_; + } }; } // namespace taichi::lang From 04656baab71b3b24d94504f9d8a5c5dfc21ded5b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Feb 2023 05:32:47 +0000 Subject: [PATCH 13/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/rhi/cuda/cuda_profiler.cpp | 9 +++++---- taichi/rhi/cuda/cuda_profiler.h | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 7238466569ac8..53e4b4180178c 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -318,10 +318,11 @@ void EventToolkitCUDA::update_record( std::vector &traced_records) { uint32_t events_num = event_records_.size(); uint32_t records_num = traced_records.size(); - TI_ERROR_IF(records_size_after_sync + events_num != records_num, - "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " - "traced_records_.size({})", - records_size_after_sync + events_num, records_num); + TI_ERROR_IF( + records_size_after_sync + events_num != records_num, + "KernelProfilerCUDA::EventToolkitCUDA: event_records_.size({}) != " + "traced_records_.size({})", + records_size_after_sync + events_num, records_num); uint32_t idx = 0; for (auto &record : event_records_) { diff --git a/taichi/rhi/cuda/cuda_profiler.h b/taichi/rhi/cuda/cuda_profiler.h index 
9b3765abf6403..0d0e3a4ded4be 100644 --- a/taichi/rhi/cuda/cuda_profiler.h +++ b/taichi/rhi/cuda/cuda_profiler.h @@ -95,4 +95,4 @@ class EventToolkit { } }; -} // namespace taichi::lang \ No newline at end of file +} // namespace taichi::lang From 77b1704c56aa9085ea0aca13d61cfc928709d7e8 Mon Sep 17 00:00:00 2001 From: zeyuli Date: Wed, 15 Feb 2023 13:33:16 +0800 Subject: [PATCH 14/15] revert EventToolkitCUDA --- taichi/rhi/amdgpu/amdgpu_profiler.h | 6 +++--- taichi/rhi/cuda/cuda_profiler.cpp | 14 +++++++------- taichi/rhi/cuda/cuda_profiler.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index c966b4ff062e4..4689aaf03b856 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -46,11 +46,11 @@ class EventToolkitAMDGPU { public: void update_record( uint32_t records_size_after_sync, - std::vector &traced_records) override; + std::vector &traced_records); KernelProfilerBase::TaskHandle start_with_handle( - const std::string &kernel_name) override; + const std::string &kernel_name); void update_timeline( - std::vector &traced_records) override; + std::vector &traced_records); void clear() { event_records_.clear(); } diff --git a/taichi/rhi/cuda/cuda_profiler.cpp b/taichi/rhi/cuda/cuda_profiler.cpp index 7238466569ac8..2b52fe2592887 100644 --- a/taichi/rhi/cuda/cuda_profiler.cpp +++ b/taichi/rhi/cuda/cuda_profiler.cpp @@ -13,7 +13,7 @@ KernelProfilerCUDA::KernelProfilerCUDA(bool enable) { metric_list_.clear(); if (enable) { // default profiling toolkit: event tool_ = ProfilingToolkit::event; - event_toolkit_ = std::make_unique(); + event_toolkit_ = std::make_unique(); } } @@ -271,7 +271,7 @@ bool KernelProfilerCUDA::record_kernel_attributes(void *kernel, // default profiling toolkit : cuEvent // for now put it together with KernelProfilerCUDA #if defined(TI_WITH_CUDA) -KernelProfilerBase::TaskHandle 
EventToolkitCUDA::start_with_handle( +KernelProfilerBase::TaskHandle EventToolkit::start_with_handle( const std::string &kernel_name) { EventRecord record; record.name = kernel_name; @@ -313,7 +313,7 @@ KernelProfilerBase::TaskHandle EventToolkitCUDA::start_with_handle( return record.stop_event; } -void EventToolkitCUDA::update_record( +void EventToolkit::update_record( uint32_t records_size_after_sync, std::vector &traced_records) { uint32_t events_num = event_records_.size(); @@ -334,7 +334,7 @@ void EventToolkitCUDA::update_record( } } -void EventToolkitCUDA::update_timeline( +void EventToolkit::update_timeline( std::vector &traced_records) { if (Timelines::get_instance().get_enabled()) { auto &timeline = Timeline::get_this_thread_instance(); @@ -354,16 +354,16 @@ void EventToolkitCUDA::update_timeline( } #else -KernelProfilerBase::TaskHandle EventToolkitCUDA::start_with_handle( +KernelProfilerBase::TaskHandle EventToolkit::start_with_handle( const std::string &kernel_name) { TI_NOT_IMPLEMENTED; } -void EventToolkitCUDA::update_record( +void EventToolkit::update_record( uint32_t records_size_after_sync, std::vector &traced_records) { TI_NOT_IMPLEMENTED; } -void EventToolkitCUDA::update_timeline( +void EventToolkit::update_timeline( std::vector &traced_records) { TI_NOT_IMPLEMENTED; } diff --git a/taichi/rhi/cuda/cuda_profiler.h b/taichi/rhi/cuda/cuda_profiler.h index 9b3765abf6403..db86d1e743df1 100644 --- a/taichi/rhi/cuda/cuda_profiler.h +++ b/taichi/rhi/cuda/cuda_profiler.h @@ -17,7 +17,7 @@ enum class ProfilingToolkit : int { cupti, }; -class EventToolkitCUDA; +class EventToolkit; // A CUDA kernel profiler class KernelProfilerCUDA : public KernelProfilerBase { @@ -55,7 +55,7 @@ class KernelProfilerCUDA : public KernelProfilerBase { // Instances of these toolkits may exist at the same time, // but only one will be enabled. 
- std::unique_ptr event_toolkit_{nullptr}; + std::unique_ptr event_toolkit_{nullptr}; std::unique_ptr cupti_toolkit_{nullptr}; std::vector metric_list_; uint32_t records_size_after_sync_{0}; From 1e625b09a1a98628477200e8fcfe534cee2605c3 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 15 Feb 2023 05:35:05 +0000 Subject: [PATCH 15/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- taichi/rhi/amdgpu/amdgpu_profiler.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/taichi/rhi/amdgpu/amdgpu_profiler.h b/taichi/rhi/amdgpu/amdgpu_profiler.h index 4689aaf03b856..3de30c82ff3f9 100644 --- a/taichi/rhi/amdgpu/amdgpu_profiler.h +++ b/taichi/rhi/amdgpu/amdgpu_profiler.h @@ -44,13 +44,11 @@ class KernelProfilerAMDGPU : public KernelProfilerBase { class EventToolkitAMDGPU { public: - void update_record( - uint32_t records_size_after_sync, - std::vector &traced_records); + void update_record(uint32_t records_size_after_sync, + std::vector &traced_records); KernelProfilerBase::TaskHandle start_with_handle( const std::string &kernel_name); - void update_timeline( - std::vector &traced_records); + void update_timeline(std::vector &traced_records); void clear() { event_records_.clear(); }