-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AutoParallel] Visualize flow parallel timing diagram in static graph mode #58313
Changes from 13 commits
c514fbd
0147f70
6d1dc3d
6f4f67c
14fd116
4d51610
c70d9f9
ad0f17a
e0442c6
a8a37bb
a20e6ce
3e10a6d
59b425e
ddc5038
14f6228
1dfc816
9f271ef
dabf964
6ad6f36
d58cc94
e9886ae
282285b
3b0db0c
fdc3f6d
1ceadc5
679cc39
8953ae9
1a04fea
e1c619d
58c9f65
9c8b740
24b7e79
63de31b
8218ecb
4b318fc
9f949f2
ffc7b39
1925dd7
d299723
5297b7a
8bfb6c0
13b14d1
5bb55e1
f422b33
ed5f7fc
718cf17
f36b57b
444b7a7
f494916
28f089f
a2b5988
fb748d9
aa5570d
6b18e10
f096253
560fb61
bbb3071
e15c19e
989348c
10b84d8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, | |
true, | ||
"Use local_scope in new executor(especially used " | ||
"in UT), can turn off for better performance"); | ||
PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 为什么还需要这个FLAGS? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 已删除 |
||
false, | ||
"Enable auto parallel profiler, collecting the " | ||
"runtime of jobs in different devices"); | ||
|
||
namespace paddle { | ||
namespace framework { | ||
|
@@ -129,5 +133,9 @@ void InterpreterCore::Build( | |
|
||
bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); } | ||
|
||
std::tuple<double, double> InterpreterCore::InterpreterRunTime() { | ||
return impl_->InterpreterRunTime(); | ||
} | ||
|
||
} // namespace framework | ||
} // namespace paddle |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,7 @@ | |
#include "paddle/phi/core/flags.h" | ||
PHI_DECLARE_bool(dynamic_static_unified_comm); | ||
#endif | ||
PHI_DECLARE_bool(auto_parallel_profiler); | ||
|
||
namespace paddle { | ||
namespace framework { | ||
|
@@ -103,6 +104,16 @@ ProgramInterpreter::~ProgramInterpreter() { | |
} | ||
|
||
void ProgramInterpreter::RunImpl() { | ||
#if defined(PADDLE_WITH_CUDA) | ||
if (FLAGS_auto_parallel_profiler) { | ||
// Note(sonder): Record the start time of the each stream. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. NOTE一般用于解释一些复杂、难以阅读的代码，或提示一些从代码中无法表达的信息。这几行代码非常简单直接，这个NOTE也只是把代码重复讲一遍，可以不需要。 |
||
for (size_t i = 0; i < stream_timers_.size(); ++i) { | ||
auto& stream_timer = stream_timers_[i]; | ||
stream_timer.Start(); | ||
} | ||
} | ||
#endif | ||
|
||
// lazy initialization of gc, do not create gc is the program only run once | ||
if (!gc_) { | ||
gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_); | ||
|
@@ -127,6 +138,15 @@ void ProgramInterpreter::RunImpl() { | |
platform::DeviceContextPool::Instance().Get(place_)->Wait(); | ||
} | ||
#endif | ||
|
||
#if defined(PADDLE_WITH_CUDA) | ||
if (FLAGS_auto_parallel_profiler) { | ||
for (size_t i = 0; i < stream_timers_.size(); ++i) { | ||
auto& stream_timer = stream_timers_[i]; | ||
stream_timer.Stop(); | ||
} | ||
} | ||
#endif | ||
} | ||
|
||
FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names, | ||
|
@@ -622,6 +642,62 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() { | |
} | ||
} | ||
|
||
// Rebuilds stream_timers_ with one timer per distinct GPU stream referenced by
// the kGpuAsync instructions in vec_instruction_, plus one trailing timer
// bound to the null (default) stream if any instruction reports a null stream.
// The body is compiled only with PADDLE_WITH_CUDA or PADDLE_WITH_HIP.
//
// NOTE(review): a reviewer on this change stated that Paddle never launches
// work on the null stream; if that is confirmed, has_default_stream and the
// trailing timer can be removed.
void ProgramInterpreter::AddGpuStreamEvents() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  stream_timers_.clear();
  std::vector<gpuStream_t> streams;
  bool has_default_stream = false;

  for (auto& instr : vec_instruction_) {
    // Skip instructions that are not kGpuAsync or that run on a CUSTOM place;
    // the GPUContext cast below is applied only to the remaining ones.
    if ((instr.KernelType() != OpFuncType::kGpuAsync) ||
        (instr.DeviceContext().GetPlace().GetType() ==
         phi::AllocationType::CUSTOM)) {
      continue;
    }

    gpuStream_t stream =
        reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext())
            .stream();

    // A null handle denotes the default stream; it gets a dedicated timer at
    // the end instead of an entry in `streams`.
    if (stream == nullptr) {
      has_default_stream = true;
      continue;
    }

    // Record every non-null stream exactly once so that each stream gets a
    // single timer.
    bool already_recorded = false;
    for (gpuStream_t known : streams) {
      if (known == stream) {
        already_recorded = true;
        break;
      }
    }
    if (!already_recorded) {
      streams.push_back(stream);
    }
  }

  const size_t timers_size =
      has_default_stream ? streams.size() + 1 : streams.size();
  stream_timers_.resize(timers_size);
  for (size_t i = 0; i < streams.size(); ++i) {
    stream_timers_[i].SetStream(streams[i]);
  }
  if (has_default_stream) {
    stream_timers_.back().SetStream(nullptr);
  }
#endif
}
|
||
// Returns (earliest StartTime(), latest EndTime()) over all timers in
// stream_timers_. The timers are only consulted when PADDLE_WITH_CUDA is
// defined; otherwise, or when there are no timers, the initial sentinels
// (max(), lowest()) are returned unchanged.
std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
  double min_start_time = std::numeric_limits<double>::max();
  // lowest() is the most negative finite double. min() would be the smallest
  // positive value, which is not a valid identity for a running maximum.
  double max_end_time = std::numeric_limits<double>::lowest();
#if defined(PADDLE_WITH_CUDA)
  for (auto& stream_timer : stream_timers_) {
    const double start_time = stream_timer.StartTime();
    const double end_time = stream_timer.EndTime();

    min_start_time = std::min(min_start_time, start_time);
    max_end_time = std::max(max_end_time, end_time);

    VLOG(3) << "ProgramInterpreter::InterpreterRunTime:"
            << "start_time: " << std::to_string(start_time)
            << ", end_time: " << std::to_string(end_time)
            << ", min_start_time: " << std::to_string(min_start_time)
            << ", max_end_time: " << std::to_string(max_end_time);
  }
#endif
  return std::make_tuple(min_start_time, max_end_time);
}
|
||
void ProgramInterpreter::Convert( | ||
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) { | ||
auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); | ||
|
@@ -658,6 +734,10 @@ void ProgramInterpreter::Convert( | |
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); | ||
} | ||
|
||
if (FLAGS_auto_parallel_profiler) { | ||
AddGpuStreamEvents(); | ||
} | ||
|
||
BuildOperatorDependences(); | ||
|
||
// NOTE(Ruibiao): For cross-step stream synchronization, an event may be | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -30,6 +30,7 @@ | |
PHI_DECLARE_bool(enable_new_ir_in_executor); | ||
PHI_DECLARE_bool(enable_pir_api); | ||
PHI_DECLARE_bool(new_ir_apply_inplace_pass); | ||
PHI_DECLARE_bool(auto_parallel_profiler); | ||
|
||
namespace paddle { | ||
namespace framework { | ||
|
@@ -205,6 +206,23 @@ paddle::framework::FetchList StandaloneExecutor::Run( | |
} | ||
} | ||
|
||
// record each job's run time | ||
#if defined(PADDLE_WITH_CUDA) | ||
if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) { | ||
for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) { | ||
const auto& job = jobs[job_idx]; | ||
const std::string& job_type = job->Type(); | ||
double start_time, end_time; | ||
std::tie(start_time, end_time) = | ||
interpretercores_[job_idx]->InterpreterRunTime(); | ||
VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这里加注释说明这个log的作用，否则其它人不了解的情况下可能错误改动 |
||
<< ", micro_batch_id = " << job->MicroBatchId() | ||
<< ", job_start_time = " << std::to_string(start_time) | ||
<< ", job_end_time = " << std::to_string(end_time); | ||
} | ||
} | ||
#endif | ||
|
||
// return Fetch Tensors | ||
if (FLAGS_enable_new_ir_in_executor) { | ||
framework::FetchList fetch_res; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
|
||
#pragma once | ||
|
||
#include <sys/time.h> | ||
|
||
#include "paddle/phi/backends/gpu/gpu_decls.h" | ||
#include "paddle/phi/core/enforce.h" | ||
#include "paddle/phi/core/errors.h" | ||
|
@@ -68,7 +70,18 @@ class GpuTimer { | |
#endif | ||
} | ||
|
||
float ElapsedTime() { | ||
void Start() { | ||
struct timeval time_now {}; | ||
gettimeofday(&time_now, nullptr); | ||
start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 这里可以加注释说明为何需要用CPU时间作为start_time |
||
Start(stream_); | ||
} | ||
|
||
// Delegates to the Stop(stream) overload using the stored stream_.
void Stop() {
  Stop(stream_);
}
|
||
// Stores the stream handle in stream_.
void SetStream(gpuStream_t stream) {
  stream_ = stream;
}
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 通过 There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 这个地方的代码没完全更新上，已更新最新版本，还需要麻烦看一下，通过SetStream设置stream再调用无参的Start和Stop函数 中包含了 cudaStreamAddCallback 的逻辑 |
||
|
||
double ElapsedTime() { | ||
float milliseconds = 0; | ||
#ifdef PADDLE_WITH_HIP | ||
hipEventSynchronize(stop_); | ||
|
@@ -80,9 +93,15 @@ class GpuTimer { | |
return milliseconds; | ||
} | ||
|
||
// Returns the value currently held in start_time_.
double StartTime() {
  return start_time_;
}
|
||
// Returns the end timestamp, computed as ElapsedTime() added to start_time_.
double EndTime() {
  const double elapsed = ElapsedTime();
  return elapsed + start_time_;
}
|
||
private: | ||
gpuEvent_t start_; | ||
gpuEvent_t stop_; | ||
gpuStream_t stream_; | ||
double start_time_; | ||
}; | ||
|
||
} // namespace phi |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
gpu_timer只与类接口具体的实现方式相关,与类定义无关,应只在使用到的.cc文件中include,而不在基类头文件中include