
[AutoParallel] Visualize flow parallel timing diagram in static graph mode #58313

Merged
60 commits merged on Nov 21, 2023
Changes from 13 commits (60 commits total)
c514fbd
merge from openvino master
AndSonder Oct 18, 2023
0147f70
add InterpreterRunTime() to record interpreter's run time
AndSonder Oct 20, 2023
6d1dc3d
add profiler helper static to produce json file
AndSonder Oct 20, 2023
6f4f67c
add color map and support perfetto format
AndSonder Oct 23, 2023
14fd116
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 23, 2023
4d51610
recover codes
AndSonder Oct 23, 2023
c70d9f9
control include env for gpu_timer.h
AndSonder Oct 23, 2023
ad0f17a
fix logic for profiler_helper_static.py
AndSonder Oct 23, 2023
e0442c6
fix build error
AndSonder Oct 23, 2023
a8a37bb
fix build error
AndSonder Oct 23, 2023
a20e6ce
recover thirdparty
AndSonder Oct 23, 2023
3e10a6d
add flag control: not support new ir now
AndSonder Oct 24, 2023
59b425e
set auto_parallel_profiler flag to false
AndSonder Oct 25, 2023
ddc5038
fix
AndSonder Oct 26, 2023
14f6228
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 26, 2023
1dfc816
add auto_parallel_profiler as command parameter
AndSonder Oct 26, 2023
9f271ef
fix value name
AndSonder Oct 26, 2023
dabf964
support gettimeofday for win env
AndSonder Oct 27, 2023
6ad6f36
fix win build error
AndSonder Oct 27, 2023
d58cc94
fix win build error
AndSonder Oct 27, 2023
e9886ae
use job_type_to_id
AndSonder Oct 27, 2023
282285b
Fixed repeatedly timing the same stream
AndSonder Oct 27, 2023
3b0db0c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Oct 31, 2023
fdc3f6d
add step line for timeline
AndSonder Nov 1, 2023
1ceadc5
add step timeline and fix logic when job overlap
AndSonder Nov 2, 2023
679cc39
update time record logic
AndSonder Nov 6, 2023
8953ae9
Merge branch 'develop' into add_profiler
AndSonder Nov 6, 2023
1a04fea
fix bug when start profile start from none zero step
AndSonder Nov 7, 2023
e1c619d
fix note
AndSonder Nov 7, 2023
58c9f65
Merge branch 'add_profiler' of https://github.com/AndSonder/Paddle in…
AndSonder Nov 7, 2023
9c8b740
remove FLAGS_auto_parallel_profiler
AndSonder Nov 7, 2023
24b7e79
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 7, 2023
63de31b
use run config instead FLAGS_auto_parallelxx
AndSonder Nov 7, 2023
8218ecb
fix color map logic
AndSonder Nov 7, 2023
4b318fc
fix color map logic
AndSonder Nov 7, 2023
9f949f2
fix bug when log step does not start from 0
AndSonder Nov 8, 2023
ffc7b39
fix
AndSonder Nov 9, 2023
1925dd7
fix
AndSonder Nov 9, 2023
d299723
don't use set_enable_auto_parallel_profiler
AndSonder Nov 9, 2023
5297b7a
fix bug
AndSonder Nov 9, 2023
8bfb6c0
disable auto_parallel_profiler when not open flag by command line
AndSonder Nov 9, 2023
13b14d1
fix bug
AndSonder Nov 9, 2023
5bb55e1
remove resettime
AndSonder Nov 10, 2023
f422b33
fix build bug
AndSonder Nov 13, 2023
ed5f7fc
fix
AndSonder Nov 13, 2023
718cf17
remove set enable
AndSonder Nov 14, 2023
f36b57b
fix build error
AndSonder Nov 15, 2023
444b7a7
fix build error
AndSonder Nov 15, 2023
f494916
fix build error
AndSonder Nov 15, 2023
28f089f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 15, 2023
a2b5988
fix ci error
AndSonder Nov 15, 2023
fb748d9
fix
AndSonder Nov 15, 2023
aa5570d
fix run error
AndSonder Nov 15, 2023
6b18e10
fix
AndSonder Nov 15, 2023
f096253
fix
AndSonder Nov 16, 2023
560fb61
fix calculate_stream_timer logic
AndSonder Nov 16, 2023
bbb3071
remove fluid head
AndSonder Nov 17, 2023
e15c19e
fix build error
AndSonder Nov 17, 2023
989348c
set default value for enable_job_schedule_profiler
AndSonder Nov 17, 2023
10b84d8
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
AndSonder Nov 18, 2023
6 changes: 6 additions & 0 deletions paddle/fluid/framework/new_executor/interpreter_base_impl.h
@@ -38,6 +38,10 @@
#include "paddle/fluid/platform/device_event.h"
#include "paddle/phi/backends/device_manager.h"

#if defined(PADDLE_WITH_CUDA)
#include "paddle/phi/kernels/autotune/gpu_timer.h"
Reviewer (Contributor): gpu_timer relates only to how the class interface is implemented, not to the class definition. It should be included only in the .cc files that actually use it, not in the base-class header.

#endif

PD_DECLARE_bool(new_executor_serial_run);
PD_DECLARE_bool(new_executor_static_build);
PD_DECLARE_bool(new_executor_use_inplace);
@@ -103,6 +107,8 @@ class InterpreterBaseImpl {
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) = 0;

virtual bool IsStaticBuild() const = 0;

virtual std::tuple<double, double> InterpreterRunTime() = 0;
};

inline void SetDeviceId(const platform::Place& place) {
8 changes: 8 additions & 0 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -34,6 +34,10 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
true,
"Use local_scope in new executor(especially used "
"in UT), can turn off for better performance");
PADDLE_DEFINE_EXPORTED_bool(auto_parallel_profiler,
Reviewer (Contributor): Why is this FLAG still needed?

Author: Removed.

false,
"Enable auto parallel profiler, collecting the "
"runtime of jobs in different devices");

namespace paddle {
namespace framework {
@@ -129,5 +133,9 @@ void InterpreterCore::Build(

bool InterpreterCore::IsStaticBuild() const { return impl_->IsStaticBuild(); }

std::tuple<double, double> InterpreterCore::InterpreterRunTime() {
return impl_->InterpreterRunTime();
}

} // namespace framework
} // namespace paddle
2 changes: 2 additions & 0 deletions paddle/fluid/framework/new_executor/interpretercore.h
@@ -79,6 +79,8 @@ class InterpreterCore {

bool IsStaticBuild() const;

std::tuple<double, double> InterpreterRunTime();

private:
DISABLE_COPY_AND_ASSIGN(InterpreterCore);

5 changes: 5 additions & 0 deletions paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -276,6 +276,11 @@ void NewIRInterpreter::ShareBuildResultsFrom(const InterpreterBaseImpl& src) {
<< ") to InterpreterCore(" << this << ")";
}

std::tuple<double, double> NewIRInterpreter::InterpreterRunTime() {
PADDLE_THROW(platform::errors::Unimplemented(
"NewIRInterpreter::InterpreterRunTime is not implemented."));
}

const interpreter::NewIrDependencyBuilder&
NewIRInterpreter::GetNewIrDependencyBuilder() const {
return ir_dependency_builder_;
2 changes: 2 additions & 0 deletions paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -60,6 +60,8 @@ class NewIRInterpreter : public InterpreterBaseImpl {

void ShareBuildResultsFrom(const InterpreterBaseImpl& src) override;

std::tuple<double, double> InterpreterRunTime() override;

std::shared_ptr<std::vector<size_t>> GetDependencyCount() const override;

bool IsSharedResultsBuild() const override;
80 changes: 80 additions & 0 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -39,6 +39,7 @@
#include "paddle/phi/core/flags.h"
PHI_DECLARE_bool(dynamic_static_unified_comm);
#endif
PHI_DECLARE_bool(auto_parallel_profiler);

namespace paddle {
namespace framework {
@@ -103,6 +104,16 @@ ProgramInterpreter::~ProgramInterpreter() {
}

void ProgramInterpreter::RunImpl() {
#if defined(PADDLE_WITH_CUDA)
if (FLAGS_auto_parallel_profiler) {
// Note(sonder): Record the start time of each stream.
Reviewer (Contributor): A NOTE is normally used to explain complex, hard-to-read code, or to convey information that cannot be expressed in the code itself. These few lines are simple and direct, and this NOTE only restates them, so it can be dropped.

for (size_t i = 0; i < stream_timers_.size(); ++i) {
auto& stream_timer = stream_timers_[i];
stream_timer.Start();
}
}
#endif

// lazy initialization of gc, do not create gc is the program only run once
if (!gc_) {
gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
@@ -127,6 +138,15 @@
platform::DeviceContextPool::Instance().Get(place_)->Wait();
}
#endif

#if defined(PADDLE_WITH_CUDA)
if (FLAGS_auto_parallel_profiler) {
for (size_t i = 0; i < stream_timers_.size(); ++i) {
auto& stream_timer = stream_timers_[i];
stream_timer.Stop();
}
}
#endif
}

FetchList ProgramInterpreter::Run(const std::vector<std::string>& feed_names,
@@ -622,6 +642,62 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {
}
}

void ProgramInterpreter::AddGpuStreamEvents() {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
stream_timers_.clear();
std::vector<gpuStream_t> streams;
bool has_default_stream = false;
Reviewer (Contributor): The Paddle framework never uses the null (default) stream, so this case does not need to be handled.

for (size_t i = 0; i < vec_instruction_.size(); ++i) {
auto& instr = vec_instruction_[i];
if ((instr.KernelType() != OpFuncType::kGpuAsync) ||
(instr.DeviceContext().GetPlace().GetType() ==
phi::AllocationType::CUSTOM)) {
continue;
}

gpuStream_t stream =
reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext())
.stream();

if (stream == nullptr) {
has_default_stream = true;
} else if (std::find(streams.begin(), streams.end(), stream) ==
streams.end()) {
streams.push_back(stream);
}
}
size_t timers_size = has_default_stream ? streams.size() + 1 : streams.size();
stream_timers_.resize(timers_size);
for (size_t i = 0; i < streams.size(); ++i) {
stream_timers_[i].SetStream(streams[i]);
}
if (has_default_stream) {
stream_timers_.back().SetStream(nullptr);
}

#endif
}

std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
double min_start_time = std::numeric_limits<double>::max(),
max_end_time = std::numeric_limits<double>::lowest();
#if defined(PADDLE_WITH_CUDA)
for (size_t i = 0; i < stream_timers_.size(); ++i) {
auto& stream_timer = stream_timers_[i];
double start_time = stream_timer.StartTime();
double end_time = stream_timer.EndTime();

min_start_time = std::min(min_start_time, start_time);
max_end_time = std::max(max_end_time, end_time);

VLOG(3) << "ProgramInterpreter::InterpreterRunTime:"
<< "start_time: " << std::to_string(start_time)
<< ", end_time: " << std::to_string(end_time) << ", min_start_time"
<< std::to_string(min_start_time)
<< ", max_end_time: " << std::to_string(max_end_time);
}
#endif
return std::make_tuple(min_start_time, max_end_time);
}

void ProgramInterpreter::Convert(
std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
@@ -658,6 +734,10 @@
vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
}

if (FLAGS_auto_parallel_profiler) {
AddGpuStreamEvents();
}

BuildOperatorDependences();

// NOTE(Ruibiao): For cross-step stream synchronization, an event may be
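The reduction performed by `InterpreterRunTime` above can be isolated as a small host-side function: each stream timer contributes a (start, end) window, and the interpreter's overall run spans from the earliest start to the latest end. A minimal standalone sketch follows — the `StreamWindow` struct and `RunWindow` name are hypothetical stand-ins, not Paddle's API:

```cpp
#include <algorithm>
#include <limits>
#include <tuple>
#include <vector>

// Hypothetical stand-in for one phi::GpuTimer's recorded window (ms).
struct StreamWindow {
  double start;  // wall-clock start time
  double end;    // start + elapsed
};

// Mirrors the aggregation in ProgramInterpreter::InterpreterRunTime:
// the run window is [earliest stream start, latest stream end].
std::tuple<double, double> RunWindow(const std::vector<StreamWindow>& timers) {
  double min_start = std::numeric_limits<double>::max();
  double max_end = std::numeric_limits<double>::lowest();
  for (const auto& t : timers) {
    min_start = std::min(min_start, t.start);
    max_end = std::max(max_end, t.end);
  }
  return std::make_tuple(min_start, max_end);
}
```

Note that seeding `max_end` with `lowest()` rather than `min()` matters: `std::numeric_limits<double>::min()` is the smallest positive value, which would silently break the comparison for negative timestamps.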
8 changes: 8 additions & 0 deletions paddle/fluid/framework/new_executor/program_interpreter.h
@@ -98,6 +98,8 @@ class ProgramInterpreter : public InterpreterBaseImpl {

bool IsStaticBuild() const override { return static_build_; }

std::tuple<double, double> InterpreterRunTime() override;

private:
// build graph
void Convert(std::vector<paddle::framework::OpFuncNode>* op_func_nodes);
@@ -149,6 +151,8 @@
// For log and debug
std::string GetDepsString() const;

void AddGpuStreamEvents();

bool is_build_{false};
bool static_build_{false};
// Note(sonder): share the op dependency and event analysis procedure.
@@ -210,6 +214,10 @@
InstructionSchedulingPriorityLess instruction_scheduling_priority_less;

std::vector<HookFunc> hookfuncs_;

#if defined(PADDLE_WITH_CUDA)
std::vector<phi::GpuTimer> stream_timers_;
#endif
};

} // namespace framework
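`AddGpuStreamEvents` allocates one timer per distinct GPU stream, plus one extra slot if the default (null) stream appears — the commit "Fixed repeatedly timing the same stream" suggests the deduplication is the point. The counting step can be sketched in isolation; `StreamId` and `CountStreamTimers` are hypothetical stand-ins for `gpuStream_t` and the in-tree logic:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

using StreamId = const void*;  // stand-in for gpuStream_t

// Returns the number of timers needed: one per unique non-null stream,
// plus one shared slot for the default (null) stream if it was seen.
std::size_t CountStreamTimers(const std::vector<StreamId>& seen) {
  std::vector<StreamId> unique_streams;
  bool has_default = false;
  for (StreamId s : seen) {
    if (s == nullptr) {
      has_default = true;  // default stream gets a single shared timer
    } else if (std::find(unique_streams.begin(), unique_streams.end(), s) ==
               unique_streams.end()) {
      unique_streams.push_back(s);  // dedup: time each stream only once
    }
  }
  return unique_streams.size() + (has_default ? 1 : 0);
}
```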
18 changes: 18 additions & 0 deletions paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -30,6 +30,7 @@
PHI_DECLARE_bool(enable_new_ir_in_executor);
PHI_DECLARE_bool(enable_pir_api);
PHI_DECLARE_bool(new_ir_apply_inplace_pass);
PHI_DECLARE_bool(auto_parallel_profiler);

namespace paddle {
namespace framework {
@@ -205,6 +206,23 @@ paddle::framework::FetchList StandaloneExecutor::Run(
}
}

// record each job's run time
#if defined(PADDLE_WITH_CUDA)
if (FLAGS_auto_parallel_profiler && !FLAGS_enable_new_ir_in_executor) {
for (size_t job_idx = 0; job_idx < jobs.size(); ++job_idx) {
const auto& job = jobs[job_idx];
const std::string& job_type = job->Type();
double start_time, end_time;
std::tie(start_time, end_time) =
interpretercores_[job_idx]->InterpreterRunTime();
VLOG(0) << "Profiler Info: Job (" << job_idx << "), type = " << job_type
Reviewer (Contributor): Add a comment here explaining what this log is for; otherwise someone unfamiliar with it may change it by mistake.

<< ", micro_batch_id = " << job->MicroBatchId()
<< ", job_start_time = " << std::to_string(start_time)
<< ", job_end_time = " << std::to_string(end_time);
}
}
#endif

// return Fetch Tensors
if (FLAGS_enable_new_ir_in_executor) {
framework::FetchList fetch_res;
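Per the commit log, a helper script (profiler_helper_static.py) turns these "Profiler Info" log lines into a JSON timeline in Chrome-trace/Perfetto format. As a rough illustration of that mapping — the field layout below follows the Chrome trace-event format, not the actual script, and `JobTraceEvent` is a hypothetical name — each job's (start, end) pair becomes one complete ("X") event with microsecond `ts`/`dur`:

```cpp
#include <sstream>
#include <string>

// Build one Chrome-trace "complete" event for a job's time window.
// start_ms/end_ms are the values logged by StandaloneExecutor::Run;
// ts and dur must be expressed in microseconds.
std::string JobTraceEvent(const std::string& job_type, int micro_batch_id,
                          double start_ms, double end_ms) {
  std::ostringstream os;
  os << "{\"name\": \"" << job_type << " (mb " << micro_batch_id << ")\", "
     << "\"ph\": \"X\", "
     << "\"ts\": " << start_ms * 1000.0 << ", "
     << "\"dur\": " << (end_ms - start_ms) * 1000.0 << ", "
     << "\"pid\": 0, \"tid\": 0}";
  return os.str();
}
```

A file of such events (wrapped in a `traceEvents` array) can be loaded directly into chrome://tracing or Perfetto to visualize the pipeline-parallel schedule.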
21 changes: 20 additions & 1 deletion paddle/phi/kernels/autotune/gpu_timer.h
@@ -14,6 +14,8 @@

#pragma once

#include <sys/time.h>

#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
@@ -68,7 +70,18 @@ class GpuTimer {
#endif
}

float ElapsedTime() {
void Start() {
struct timeval time_now {};
gettimeofday(&time_now, nullptr);
start_time_ = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
Reviewer (Contributor): A comment could be added here explaining why CPU time is needed as start_time.

Start(stream_);
}

void Stop() { Stop(stream_); }

void SetStream(gpuStream_t stream) { stream_ = stream; }
Reviewer (Contributor): Setting the stream via SetStream and then calling the parameterless Start/Stop seems equivalent to calling the Start/Stop overloads that take a stream argument. In that case, adding a second, equivalent interface is not recommended.

Author (Contributor): The code here had not been fully updated; the latest version is now pushed, please take another look. The parameterless Start/Stop used after SetStream also contains the cudaStreamAddCallback logic.


double ElapsedTime() {
float milliseconds = 0;
#ifdef PADDLE_WITH_HIP
hipEventSynchronize(stop_);
@@ -80,9 +93,15 @@
return milliseconds;
}

double StartTime() { return start_time_; }

double EndTime() { return ElapsedTime() + start_time_; }

private:
gpuEvent_t start_;
gpuEvent_t stop_;
gpuStream_t stream_;
double start_time_;
};

} // namespace phi
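The timestamp scheme these GpuTimer changes rely on: `Start()` records a CPU wall-clock anchor via `gettimeofday` (CUDA events only measure elapsed time, not absolute time), and `EndTime()` is that anchor plus the GPU-measured elapsed milliseconds. The arithmetic can be checked host-only — `NowMs`/`EndTimeMs` below are illustrative names, with no CUDA involved:

```cpp
#include <sys/time.h>

// CPU wall-clock in milliseconds, as GpuTimer::Start records it.
double NowMs() {
  struct timeval tv {};
  gettimeofday(&tv, nullptr);
  return tv.tv_sec * 1000.0 + tv.tv_usec / 1000.0;
}

// EndTime() = CPU anchor taken at Start() + GPU-measured elapsed ms.
// The GPU events give only a duration; the CPU anchor places that
// duration on an absolute timeline shared by all streams and jobs.
double EndTimeMs(double start_ms, double gpu_elapsed_ms) {
  return start_ms + gpu_elapsed_ms;
}
```

Because every timer's window is anchored to the same host clock, windows from different streams (and different jobs) can be compared and drawn on one timeline, which is what the visualization depends on.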