From 5e1faecc8931fe72f1d5c8789b0bcd822d497e00 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 17 Jul 2023 03:27:59 +0000 Subject: [PATCH 01/18] add interface --- .../interpreter/dependency_builder.cc | 51 +++++++++++++++ .../interpreter/dependency_builder.h | 59 ++++++++++++++++++ .../new_executor/new_ir_interpreter.cc | 62 +++++++++++++++++++ .../new_executor/new_ir_interpreter.h | 9 +++ 4 files changed, 181 insertions(+) diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 76fb08baca47cb..41e5afa6bf4e8e 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/new_executor/interpreter/dependency_builder.h" #include +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/flags.h" PADDLE_DEFINE_EXPORTED_bool( @@ -527,6 +528,56 @@ void DependencyBuilder::ShrinkDownstreamMap() { << StringizeDownstreamMap(op_downstream_map_); } +// /// ======================== /// +// /// For new ir /// +// /// ======================== /// +const std::map>& IrDependencyBuilder::Build( + const std::vector>& + instructions) { + if (is_build_) { + return op_downstream_map_; + } + + instructions_ = &instructions; + op_num_ = instructions_->size(); + + ops_before_.assign(op_num_, {}); + ops_behind_.assign(op_num_, {}); + op_happens_before_.assign(op_num_, std::vector(op_num_, false)); + + // BuildDownstreamMap(); + // VLOG(6) << "Finish BuildDownstreamMap"; + + // ShrinkDownstreamMap(); + // VLOG(6) << "Finish ShrinkDownstreamMap"; + + // if (FLAGS_new_executor_sequential_run) { + // AddDependencyForSequentialRun(); + // } + + // AddDependencyForCoalesceTensorOp(); + + // if (FLAGS_add_dependency_for_communication_op) { + // AddDependencyForCommunicationOp(); + // VLOG(6) << "Finish AddDependencyForSequentialRun"; + // } + + // AddDependencyForRandomOp(); + // VLOG(6) << "Finish AddDependencyForRandomOp"; + + // AddDependencyForReadOp(); + // VLOG(6) << "Finish AddDependencyForReadOp"; + + // VLOG(6) << "Finish build dependency"; + // VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); + // VLOG(8) << "downstream_map: " << std::endl + // << StringizeDownstreamMap(op_downstream_map_); + + is_build_ = true; + + return op_downstream_map_; +} + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index 4d427f01fd4b72..eb65c23f4a6527 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -23,6 +23,7 @@ DECLARE_bool(new_executor_sequential_run); namespace paddle { namespace framework { +class InstructionBase; namespace interpreter { // DependencyBuilder provides some dependency adding function to handle the @@ -84,6 +85,64 @@ class DependencyBuilder { std::vector> op_happens_before_; }; +// /// ======================== /// +// /// For new ir /// +// /// ======================== /// +class IrDependencyBuilder { + public: + IrDependencyBuilder() : is_build_(false), instructions_(nullptr) {} + + // build op dependencies and return the 
mapping from op to its downstream-op + // set + const std::map>& Build( + const std::vector>& + instructions); + + const std::map>& OpDownstreamMap() const; + + bool OpHappensBefore(size_t prior_op_idx, size_t posterior_op_idx) const { + PADDLE_ENFORCE_GE( + op_happens_before_.size(), + 0, + phi::errors::Unavailable("op_happen_before is not yet built")); + return op_happens_before_.at(prior_op_idx).at(posterior_op_idx); + } + + private: + void AddDependencyForCoalesceTensorOp(); + void AddDependencyForCommunicationOp(); + void AddDependencyForRandomOp(); + void AddDependencyForReadOp(); + void AddDependencyForSequentialRun(); + + void AddDownstreamOp(size_t prior_op_idx, size_t posterior_op_idx); + + void BuildDownstreamMap(); + + void ShrinkDownstreamMap(); + + bool is_build_; + const std::vector>* + instructions_; // not_own + size_t op_num_; + + // ops_behind_ is the adjacency list about op to its posterior-ops, that is to + // say, op_behind_[i] == {a, b, c} means op[a], op[b] and op[c] depend on + // op[i] directly or indirectly. ops_before_ is the revered adjacency list of + // ops_behind_. + std::vector> ops_before_; + std::vector> ops_behind_; + + // op_downstream_map_ is the mapping from op to its downstream-op set, that is + // to say, op_downstream_map_[i] == {a, b, c} means op[a], op[b] and op[c] + // depend on op[i] directly. + std::map> op_downstream_map_; + + // op_happens_before_ is a matrix form of ops_before_ and ops_behind_, it is + // used to speed up the query. + std::vector> op_happens_before_; +}; + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 055c9ff2383cb6..08bd375c544821 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -191,6 +191,7 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, &variable_2_var_name_, &var_name_2_id_, &variable_list_); + VLOG(4) << DebugValueInfo(); std::vector op_func_nodes; interpreter::BuildOpFuncList(place_, @@ -247,7 +248,9 @@ FetchList NewIRInterpreter::BetaRun(const std::vector& feed_names, &variable_2_var_name_, &var_name_2_id_, &variable_list_); + VLOG(4) << DebugValueInfo(); BuildInstruction(); + BuildInstructionDependences(); for (size_t instr_id = 0; instr_id < vec_instruction_base_.size(); ++instr_id) { vec_instruction_base_[instr_id]->Run(); @@ -1543,5 +1546,64 @@ void NewIRInterpreter::BuildInstruction() { } } +std::string NewIRInterpreter::DebugValueInfo() { + std::stringstream os; + os << "value info of interpretercore " << this << "\n" + << "value -> var_name -> id -> variable*" + << "\n"; + for (auto kv : value_2_var_name_) { + os << kv.first.impl() << " -> " << kv.second << " -> " + << var_name_2_id_.at(kv.second) << " -> " + << InnerScope()->FindVar(kv.second) << "\n"; + } + return os.str(); +} + +void NewIRInterpreter::BuildInstructionDependences() { + // analysis the dependences between instructions, add next_instr_list to each + // instr, and set the dependecy_count_ + size_t instr_num = vec_instruction_base_.size(); + dependecy_count_ = std::vector(instr_num, 0); + auto downstream_map = ir_dependency_builder_.Build(vec_instruction_base_); + + // for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { + // Instruction& cur_instr = vec_instruction_[instr_id]; + // const std::set& next_instr_ids = downstream_map[instr_id]; + + // if 
(FLAGS_new_executor_serial_run) { + // for (size_t next_instr_id : next_instr_ids) { + // cur_instr.AddNextInstrInSameThread(next_instr_id); + // } + // } else { + // if (cur_instr.KernelType() == OpFuncType::kGpuAsync) { + // for (size_t next_instr_id : next_instr_ids) { + // if (vec_instruction_[next_instr_id].KernelType() == + // OpFuncType::kGpuAsync) { + // cur_instr.AddNextInstrInSameThread(next_instr_id); + // } else { + // cur_instr.AddNextInstrInDifferentThread(next_instr_id); + // } + // } + // } else { + // bool has_instr_in_same_thread = false; + // for (size_t next_instr_id : next_instr_ids) { + // if (!has_instr_in_same_thread && + // vec_instruction_[next_instr_id].KernelType() != + // OpFuncType::kGpuAsync) { + // cur_instr.AddNextInstrInSameThread(next_instr_id); + // has_instr_in_same_thread = true; + // } else { + // cur_instr.AddNextInstrInDifferentThread(next_instr_id); + // } + // } + // } + // } + + // for (size_t next_instr_id : next_instr_ids) { + // ++dependecy_count_[next_instr_id]; + // } + // } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 7f84fdfcdb8806..dfff64e35843f3 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -186,17 +186,26 @@ class NewIRInterpreter : public InterpreterBaseImpl { /// ======================== /// /// For new ir /// /// ======================== /// + std::string DebugValueInfo(); + void BuildInstruction(); + void BuildInstructionDependences(); + std::unique_ptr<::ir::Program> ir_program_{nullptr}; std::vector> vec_instruction_base_; std::unordered_map<::ir::Value, std::string> value_2_var_name_; + std::unordered_map variable_2_var_name_; + std::map var_name_2_id_; + std::vector variable_list_; + + interpreter::IrDependencyBuilder ir_dependency_builder_; }; } // namespace framework From 451c0b91dc2a58de45167c1f95793951d8f18157 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 17 Jul 2023 08:32:04 +0000 Subject: [PATCH 02/18] add code --- .../instruction/instruction_base.cc | 4 +- .../instruction/instruction_base.h | 21 ++-- .../instruction/phi_kernel_instruction.cc | 96 ++++++++++++-- .../instruction/phi_kernel_instruction.h | 13 +- .../interpreter/dependency_builder.cc | 119 +++++++++++++++++- .../interpreter/interpreter_util.cc | 4 +- .../new_executor/new_ir_interpreter.cc | 11 +- .../ir/phi_kernel_adaptor/phi_kernel_util.h | 38 +----- 8 files changed, 249 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index eb6394f97945d2..6c09d7aa2a13fd 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -84,12 +84,12 @@ void InstructionBase::AddInplace(Variable* in, Variable* out) { void InstructionBase::ClearInplace() { vec_inplace_in_to_out_.clear(); } void InstructionBase::SetInputs( - const std::map>& inputs) { + const std::unordered_map>& inputs) { input_index_ = inputs; } void InstructionBase::SetOutputs( - const std::map>& outputs) { + const std::unordered_map>& outputs) { output_index_ = outputs; } diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 
cd9531660af6b4..7771158f15752f 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -22,6 +22,10 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/event.h" +namespace ir { +class Value; +} // namespace ir + namespace paddle { namespace framework { @@ -103,21 +107,22 @@ class InstructionBase { std::map& GetMutableInplaceBackMap() { return inplace_back_map_; } const std::map& GetInplaceBackMap() { return inplace_back_map_; } - const std::map>& Inputs() const { + const std::unordered_map>& Inputs() const { return input_index_; } - std::map>& GetMutableInputs() { + std::unordered_map>& GetMutableInputs() { return input_index_; } - void SetInputs(const std::map>& inputs); + void SetInputs(const std::unordered_map>& inputs); - const std::map>& Outputs() const { + const std::unordered_map>& Outputs() const { return output_index_; } - std::map>& GetMutableOutputs() { + std::unordered_map>& GetMutableOutputs() { return output_index_; } - void SetOutputs(const std::map>& outputs); + void SetOutputs( + const std::unordered_map>& outputs); virtual void Run() = 0; @@ -147,8 +152,8 @@ class InstructionBase { vec_inplace_in_to_out_; // If not use share data, need this ? std::map inplace_back_map_; - std::map> input_index_; - std::map> output_index_; + std::unordered_map> input_index_; + std::unordered_map> output_index_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 32c7e265e7ba6b..787cf564932b5e 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -73,7 +73,10 @@ PhiKernelInstruction::PhiKernelInstruction( ir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_name_map) + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) : InstructionBase(id, place) { auto op_attributes = op->attributes(); auto op_name = @@ -117,8 +120,10 @@ PhiKernelInstruction::PhiKernelInstruction( // op_func_node.scheduling_priority_ = 1; // } // } + VLOG(6) << "finish process dist attributes"; SetKernelType(AnalyseOpFuncType(op, place)); + VLOG(6) << "finish process analyse kernel type"; infer_meta_interface_ = op_info.GetInterfaceImpl(); @@ -134,7 +139,7 @@ PhiKernelInstruction::PhiKernelInstruction( paddle::small_vector, paddle::small_vector, false>(op, - value_2_name_map, + value_2_var_name, scope, local_scope, yaml_info_parser, @@ -159,13 +164,11 @@ PhiKernelInstruction::PhiKernelInstruction( paddle::small_vector, paddle::small_vector, true>(op, - value_2_name_map, + value_2_var_name, scope, local_scope, yaml_info_parser, - &kernel_context_, - &(GetMutableInputs()), - &(GetMutableOutputs())); + &kernel_context_); kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process kernel context"; @@ -173,12 +176,89 @@ PhiKernelInstruction::PhiKernelInstruction( SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process device context"; + + Scope* inner_scope = local_scope == nullptr ? 
scope : local_scope; + InitInputsOutputs( + op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + VLOG(6) << "finish process inputs outputs index"; +} + +std::vector GetValueIds( + ir::Value value, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) { + std::vector ids; + std::string var_name = value_2_var_name.at(value); + ids.push_back(var_name_2_id.at(var_name)); + // NOTE(zhangbo): Value maybe a VariableRefArray + auto var = inner_scope->FindVar(var_name); + if (var->IsType()) { + auto& var_array = var->Get(); + for (size_t i = 0; i < var_array.size(); ++i) { + ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i]))); + } + } + return ids; +} + +void PhiKernelInstruction::InitInputsOutputs( + ::ir::Operation* op, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) { + std::unordered_map> inputs; + for (size_t i = 0; i < op->num_operands(); i++) { + ir::Value value = op->operand(i); + if (value) { + PADDLE_ENFORCE_NE( + value_2_var_name.find(value), + value_2_var_name.end(), + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + phi_op_name_)); + std::vector inputs_id = GetValueIds(value, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name); + inputs.emplace(value, inputs_id); + } + } + SetInputs(inputs); + VLOG(8) << "finish process inputs_index"; + std::unordered_map> outputs; + for (size_t i = 0; i < op->num_results(); i++) { + ir::Value value = op->result(i); + if (value) { + PADDLE_ENFORCE_NE( + value_2_var_name.find(value), + value_2_var_name.end(), + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + phi_op_name_)); + std::vector outputs_id = GetValueIds(value, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name); + outputs.emplace(value, outputs_id); + } + } + SetOutputs(outputs); + VLOG(8) << "finish process outputs_index"; } void PhiKernelInstruction::Run() { - VLOG(5) << "Run op " << phi_op_name_ << " infer meta."; + VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; infer_meta_interface_->infer_meta_(&(infer_meta_context_)); - VLOG(5) << "Run op " << phi_op_name_ << " kernel."; + VLOG(6) << "Run op " << phi_op_name_ << " kernel."; (*(phi_kernel_))(&(kernel_context_)); } diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 72a34f722ce057..9de6399a9f5266 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -33,7 +33,10 @@ class PhiKernelInstruction : public InstructionBase { ::ir::Operation* op, Scope* scope, Scope* local_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_name_map); + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name); const std::string& PhiOpName() const { return phi_op_name_; } @@ -52,6 +55,14 @@ class PhiKernelInstruction : public InstructionBase { void Run() override; private: + void InitInputsOutputs( + ::ir::Operation* op, + Scope* inner_scope, + const std::unordered_map<::ir::Value, 
std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name); + std::string phi_op_name_; paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{ diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 41e5afa6bf4e8e..b601751d9361c6 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -545,7 +545,7 @@ const std::map>& IrDependencyBuilder::Build( ops_behind_.assign(op_num_, {}); op_happens_before_.assign(op_num_, std::vector(op_num_, false)); - // BuildDownstreamMap(); + BuildDownstreamMap(); // VLOG(6) << "Finish BuildDownstreamMap"; // ShrinkDownstreamMap(); @@ -578,6 +578,123 @@ const std::map>& IrDependencyBuilder::Build( return op_downstream_map_; } +void IrDependencyBuilder::BuildDownstreamMap() { + auto var2min_rw_op = + std::map>(); // # map from variable id to read + // write op id. + auto var2recent_write_op = + std::map(); // # map from variable to recent write op. + auto op2dependences = + std::map>(); //# map from op to the dependence list, + // op must run after the dependence. + std::set + remove_duplicate; // remove the duplicate between inputs and outputs + + // reserve + for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + op2dependences[op_idx] = std::set(); + } + + // auto update_var_min_rw_op = + // [](const std::map>& op2dependences, + // std::map>* var2min_rw_op, + // size_t cur_op, + // size_t rw_var) { + // // rw_var is inputs or outputs of cur_op + // // this function update the var2min_rw_op set . + // if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { + // (*var2min_rw_op)[rw_var] = std::list(); + // } + // for (auto dep_op : op2dependences.at(cur_op)) { + // var2min_rw_op->at(rw_var).remove(dep_op); + // } + // var2min_rw_op->at(rw_var).push_back(cur_op); + // }; + + // for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + // remove_duplicate.clear(); + // // step1: update the op2dependences structure + // for (auto& item : + // instructions_->at(op_idx).Inputs()) { // for all inputs(read only) + // for (auto var : item.second) { + // if (var2recent_write_op.count(var)) + // op2dependences[op_idx].insert(var2recent_write_op[var]); + // } + // } + + // for (auto& item : + // instructions_->at(op_idx).Outputs()) { // for all write vars + // for (auto var : item.second) { + // if (var2min_rw_op.count(var)) { + // for (auto dep_op : var2min_rw_op[var]) { + // op2dependences[op_idx].insert(dep_op); + // } + // } + // } + // } + // // the original output of inplace op is also change. 
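  // Illustrative sketch of the two rules step1 encodes (op and var indices
  // assumed, not taken from a real program):
  //   read-after-write : op1 reads var 7 that op0 last wrote
  //     => op2dependences[1].insert(var2recent_write_op[7] /* == 0 */);
  //   write-after-read : op2 writes var 7 while var2min_rw_op[7] == {1}
  //     => op2dependences[2].insert(1);
  // The commented branch below extends the write rule to that original
  // output: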
+ // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { + // auto& m = instructions_->at(op_idx).InplaceBackMap(); + // for (auto& p : m) { + // auto& var = p.second; + // if (var2min_rw_op.count(var)) { + // for (auto dep_op : var2min_rw_op[var]) { + // op2dependences[op_idx].insert(dep_op); + // } + // } + // } + // } + + // // step2: update 2 var2xxxx data structure + // for (auto& item : + // instructions_->at(op_idx).Outputs()) { // for all write vars + // for (auto var : item.second) { + // var2recent_write_op[var] = op_idx; + // var2min_rw_op[var] = {static_cast(op_idx)}; + // remove_duplicate.insert(var); + // } + // } + + // // NOTE(zhiqiu): The inplace op with `transfer` also changes + // // original output after that so add original output as well + // // original: a->op->a + // // after: a->data_transfer->a'->op->a'->transfer_back->a + // // which means op writes a and a' + // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { + // auto& m = instructions_->at(op_idx).InplaceBackMap(); + // for (auto& p : m) { + // auto var = p.second; + // var2recent_write_op[var] = op_idx; + // var2min_rw_op[var] = {static_cast(op_idx)}; + // remove_duplicate.insert(var); + // } + // } + + // for (auto& item : + // instructions_->at(op_idx).Inputs()) { // for all inputs(read only) + // for (auto var : item.second) { + // if (remove_duplicate.count(var) == + // 0) { // var in input list and in output list, so remove it. + // update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); + // } + // } + // } + // } + + // // convert op2dependences to downstream_map directly. op2dependences is op + // -> + // // it's dependences, we want to get op -> [next ops] map, where ops is the + // // next instruction of op. The size of downstream != size of op2dependences + // // since there are some ops that have no downstream-op. 
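  // Worked example for the inversion below (contents assumed): with
  //   op2dependences == {2: {0, 1}}  // op2 must run after op0 and op1
  // the loop emits AddDownstreamOp(0, 2) and AddDownstreamOp(1, 2), i.e.
  //   op_downstream_map_ == {0: {2}, 1: {2}}
  // -- the same edges, flipped from "runs after" to "unblocks":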
+ // for (auto& item : op2dependences) { + // size_t op = item.first; + // for (auto dep_op : item.second) { + // AddDownstreamOp(dep_op, op); + // } + // } +} + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index bd63d20c21510f..523c0f90f8dd54 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -1010,9 +1010,7 @@ void BuildOpFuncList( scope, local_scope, op_yaml_info_parser, - &(op_func_node.kernel_context_), - &(op_func_node.input_index), - &(op_func_node.output_index)); + &(op_func_node.kernel_context_)); VLOG(6) << "finish process kernel context"; op_func_node.kernel_context_.SetDeviceContext( diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 08bd375c544821..7021c3835e5f62 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1537,8 +1537,15 @@ void NewIRInterpreter::BuildInstruction() { ++it) { VLOG(0) << "Build Instruction for op: " << op_idx; if ((*it)->dialect()->name() == "pd_kernel") { - vec_instruction_base_.emplace_back(std::make_unique( - op_idx++, place_, (*it), scope_, local_scope_, value_2_var_name_)); + vec_instruction_base_.emplace_back( + std::make_unique(op_idx++, + place_, + (*it), + scope_, + local_scope_, + value_2_var_name_, + var_name_2_id_, + variable_2_var_name_)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Now only support pd_kernel dialect.")); diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h index 7f6a804382921a..37e5085ab38951 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h @@ -55,15 +55,12 @@ template -void BuildPhiContext( - ir::Operation* op, - const std::unordered_map& name_map, - paddle::framework::Scope* scope, - paddle::framework::Scope* local_scope, - const paddle::dialect::OpYamlInfoParser& op_yaml_info, - Context* ctx, - std::map>* input_map = nullptr, - std::map>* output_map = nullptr) { +void BuildPhiContext(ir::Operation* op, + const std::unordered_map& name_map, + paddle::framework::Scope* scope, + paddle::framework::Scope* local_scope, + const paddle::dialect::OpYamlInfoParser& op_yaml_info, + Context* ctx) { paddle::framework::Scope* inner_scope = local_scope != nullptr ? 
local_scope : scope; VLOG(6) << "BuildPhiContext in scope[" << scope << "] inner_scope[" @@ -120,17 +117,6 @@ void BuildPhiContext( ir::Value ptr = op->operand(name2id.at(t)); auto in_var_name = name_map.at(ptr); - if (input_map != nullptr) { - // only deal with single input for now, [todo] need support multi input - // like concat - // TODO(phlrain): OpFuncNode need input_index and output_index, - // construct input_index and output_here, should remove input_index and - // output_index from OpFuncNode Each in_var_name named "inner_var_" + - // index, len("inner_var_") = 10 - - size_t tmp_id = std::atol(in_var_name.substr(4, 100).c_str()); - (*input_map)[std::to_string(name2id.at(t))].push_back(tmp_id); - } auto& tensor_attr_type = op_yaml_info.TensorAttrTypeName(t); VLOG(6) << "ctx->EmplaceBack mutable attr: " << t << "\t" << in_var_name; @@ -324,18 +310,6 @@ void BuildPhiContext( PADDLE_THROW( phi::errors::Unimplemented("only support DenseTensor and vector ")); } - - if (output_map != nullptr) { - // only deal with single input for now, [todo] need support multi input - // like concat - // TODO(phlrain): OpFuncNode need input_index and output_index, - // construct input_index and output_here, should remove input_index and - // output_index from OpFuncNode Each in_var_name named "inner_var_" + - // index, len("inner_var_") = 10 - - size_t tmp_id = std::atol(name.substr(4, 100).c_str()); - (*output_map)["out"].push_back(tmp_id); - } } } VLOG(6) << "Done build phi context"; From c4df6c32d19420ef8b206e14027ee7309bb33751 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 17 Jul 2023 09:41:27 +0000 Subject: [PATCH 03/18] add code --- .../instruction/instruction_base.h | 2 + .../instruction/phi_kernel_instruction.h | 4 +- .../interpreter/dependency_builder.cc | 366 ++++++++++++------ 3 files changed, 243 insertions(+), 129 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 7771158f15752f..8c3b9c9385ef20 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -126,6 +126,8 @@ class InstructionBase { virtual void Run() = 0; + virtual const std::string& Name() const = 0; + private: size_t id_; diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 9de6399a9f5266..7d3eacd2e13234 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -38,8 +38,6 @@ class PhiKernelInstruction : public InstructionBase { const std::unordered_map& variable_2_var_name); - const std::string& PhiOpName() const { return phi_op_name_; } - phi::Kernel* PhiKernel() const { return phi_kernel_; } const phi::KernelContext& KernelContext() const { return kernel_context_; } @@ -54,6 +52,8 @@ class PhiKernelInstruction : public InstructionBase { void Run() override; + const std::string& Name() const override { return phi_op_name_; } + private: void InitInputsOutputs( ::ir::Operation* op, diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index b601751d9361c6..cea075e6560e27 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ 
b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -531,6 +531,235 @@ void DependencyBuilder::ShrinkDownstreamMap() { // /// ======================== /// // /// For new ir /// // /// ======================== /// +void IrDependencyBuilder::AddDependencyForSequentialRun() { + size_t dependence_op_idx = ULLONG_MAX; + for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + if (dependence_op_idx != ULLONG_MAX) { + AddDownstreamOp(dependence_op_idx, op_idx); + } + dependence_op_idx = op_idx; + } +} + +void IrDependencyBuilder::ShrinkDownstreamMap() { + // remove unnecessary downstream ops + // for example, a->b->c + // a: b, c + // b: c + // => + // a: b + // b: c + + // shrink, find the downstream op that has no other op in the + // downstream list happens before it + for (size_t i = 0; i < op_num_; ++i) { + if (op_downstream_map_.find(i) == op_downstream_map_.end()) { + continue; + } + + std::set minumum_nexts; + for (size_t item : op_downstream_map_.at(i)) { + bool not_after_any = true; + // find the op that is not executed after any + for (size_t other_item : op_downstream_map_.at(i)) { + if (OpHappensBefore(other_item, item)) { + VLOG(8) << "happens_before: " << other_item << "->" << item + << ", so skip " << item; + not_after_any = false; + break; + } + } + if (not_after_any) { + VLOG(8) << "downstream op of " << i << ": " << item; + minumum_nexts.insert(item); + } + } + // NOTE(Ruibiao): op_happens_before will not be changed when shrink + // dowstream map + op_downstream_map_.at(i) = minumum_nexts; + } + VLOG(8) << "Finish shrink downstream map"; + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); + VLOG(8) << "downstream_map: " << std::endl + << StringizeDownstreamMap(op_downstream_map_); +} + +void IrDependencyBuilder::AddDownstreamOp(size_t prior_op_idx, + size_t posterior_op_idx) { + PADDLE_ENFORCE_EQ( + OpHappensBefore(posterior_op_idx, prior_op_idx), + false, + phi::errors::Unavailable( + "Can not add dependency %d->%d because %d is run before %d", + prior_op_idx, + posterior_op_idx, + posterior_op_idx, + prior_op_idx)); + + std::set& downstream_ops = op_downstream_map_[prior_op_idx]; + // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore + // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, + // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> + // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by + // ShrinkDownstreamMap. 
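  // For example (indices assumed, the reverse of the insertion order in the
  // note above): after AddDownstreamOp(a, b) and AddDownstreamOp(b, c),
  // op_happens_before_[b][c] is true; a later AddDownstreamOp(a, c) then
  // finds b in downstream_ops with OpHappensBefore(b, c) and returns early,
  // so the redundant transitive edge a->c is never stored: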
+ for (size_t op_idx : downstream_ops) { + if (OpHappensBefore(op_idx, posterior_op_idx)) { + VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx << "->" + << posterior_op_idx << ", skip adding " << prior_op_idx << "->" + << posterior_op_idx; + return; + } + } + downstream_ops.insert(posterior_op_idx); + + std::vector prior_of_prior = ops_before_[prior_op_idx]; + std::vector posterior_of_posterior = ops_behind_[posterior_op_idx]; + + auto update_op_happen_before = [this](size_t prior_op_idx, + size_t posterior_op_idx) { + if (!op_happens_before_[prior_op_idx][posterior_op_idx]) { + op_happens_before_[prior_op_idx][posterior_op_idx] = true; + ops_before_[posterior_op_idx].push_back(prior_op_idx); + ops_behind_[prior_op_idx].push_back(posterior_op_idx); + } + }; + + update_op_happen_before(prior_op_idx, posterior_op_idx); + + // All ops before prior-op are also before posterior-op + for (size_t op_idx : prior_of_prior) { + update_op_happen_before(op_idx, posterior_op_idx); + } + + // All ops after posterior-op are also after prior-op + for (size_t op_idx : posterior_of_posterior) { + update_op_happen_before(prior_op_idx, op_idx); + } + + VLOG(8) << prior_op_idx << "->" << posterior_op_idx; + VLOG(8) << "Add dependency from " << instructions_->at(prior_op_idx)->Name() + << "(" << prior_op_idx << ") to " + << instructions_->at(posterior_op_idx)->Name() << "(" + << posterior_op_idx << ")"; +} + +void IrDependencyBuilder::BuildDownstreamMap() { + auto var2min_rw_op = + std::map>(); // # map from variable id to read + // write op id. + auto var2recent_write_op = + std::map(); // # map from variable to recent write op. + auto op2dependences = + std::map>(); //# map from op to the dependence list, + // op must run after the dependence. + std::set + remove_duplicate; // remove the duplicate between inputs and outputs + + // reserve + for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + op2dependences[op_idx] = std::set(); + } + + auto update_var_min_rw_op = + [](const std::map>& op2dependences, + std::map>* var2min_rw_op, + size_t cur_op, + size_t rw_var) { + // rw_var is inputs or outputs of cur_op + // this function update the var2min_rw_op set . + if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { + (*var2min_rw_op)[rw_var] = std::list(); + } + for (auto dep_op : op2dependences.at(cur_op)) { + var2min_rw_op->at(rw_var).remove(dep_op); + } + var2min_rw_op->at(rw_var).push_back(cur_op); + }; + + for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { + remove_duplicate.clear(); + // step1: update the op2dependences structure + for (auto& item : + instructions_->at(op_idx)->Inputs()) { // for all inputs(read only) + for (auto var : item.second) { + if (var2recent_write_op.count(var)) + op2dependences[op_idx].insert(var2recent_write_op[var]); + } + } + + for (auto& item : + instructions_->at(op_idx)->Outputs()) { // for all write vars + for (auto var : item.second) { + if (var2min_rw_op.count(var)) { + for (auto dep_op : var2min_rw_op[var]) { + op2dependences[op_idx].insert(dep_op); + } + } + } + } + + // // the original output of inplace op is also change. 
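  // For example (indices assumed): if op1 and op2 read var 9 and op3 then
  // writes it, the write loop above inserts {1, 2} into op2dependences[3],
  // and step2 below resets var2min_rw_op[9] to {3}, so later ops only need
  // an edge to op3. The commented branch would give the pre-transfer output
  // of an inplace op the same treatment once InplaceBackMap is wired into
  // the new-IR instructions: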
+ // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { + // auto& m = instructions_->at(op_idx).InplaceBackMap(); + // for (auto& p : m) { + // auto& var = p.second; + // if (var2min_rw_op.count(var)) { + // for (auto dep_op : var2min_rw_op[var]) { + // op2dependences[op_idx].insert(dep_op); + // } + // } + // } + // } + + // step2: update 2 var2xxxx data structure + for (auto& item : + instructions_->at(op_idx)->Outputs()) { // for all write vars + for (auto var : item.second) { + var2recent_write_op[var] = op_idx; + var2min_rw_op[var] = {static_cast(op_idx)}; + remove_duplicate.insert(var); + } + } + + // // NOTE(zhiqiu): The inplace op with `transfer` also changes + // // original output after that so add original output as well + // // original: a->op->a + // // after: a->data_transfer->a'->op->a'->transfer_back->a + // // which means op writes a and a' + // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { + // auto& m = instructions_->at(op_idx).InplaceBackMap(); + // for (auto& p : m) { + // auto var = p.second; + // var2recent_write_op[var] = op_idx; + // var2min_rw_op[var] = {static_cast(op_idx)}; + // remove_duplicate.insert(var); + // } + // } + + for (auto& item : + instructions_->at(op_idx)->Inputs()) { // for all inputs(read only) + for (auto var : item.second) { + if (remove_duplicate.count(var) == + 0) { // var in input list and in output list, so remove it. + update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); + } + } + } + } + + // convert op2dependences to downstream_map directly. op2dependences is op -> + // it's dependences, we want to get op -> [next ops] map, where ops is the + // next instruction of op. The size of downstream != size of op2dependences + // since there are some ops that have no downstream-op. + for (auto& item : op2dependences) { + size_t op = item.first; + for (auto dep_op : item.second) { + AddDownstreamOp(dep_op, op); + } + } +} + const std::map>& IrDependencyBuilder::Build( const std::vector>& instructions) { @@ -546,14 +775,14 @@ const std::map>& IrDependencyBuilder::Build( op_happens_before_.assign(op_num_, std::vector(op_num_, false)); BuildDownstreamMap(); - // VLOG(6) << "Finish BuildDownstreamMap"; + VLOG(6) << "Finish BuildDownstreamMap"; - // ShrinkDownstreamMap(); - // VLOG(6) << "Finish ShrinkDownstreamMap"; + ShrinkDownstreamMap(); + VLOG(6) << "Finish ShrinkDownstreamMap"; - // if (FLAGS_new_executor_sequential_run) { - // AddDependencyForSequentialRun(); - // } + if (FLAGS_new_executor_sequential_run) { + AddDependencyForSequentialRun(); + } // AddDependencyForCoalesceTensorOp(); @@ -568,133 +797,16 @@ const std::map>& IrDependencyBuilder::Build( // AddDependencyForReadOp(); // VLOG(6) << "Finish AddDependencyForReadOp"; - // VLOG(6) << "Finish build dependency"; - // VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); - // VLOG(8) << "downstream_map: " << std::endl - // << StringizeDownstreamMap(op_downstream_map_); + VLOG(6) << "Finish build dependency"; + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); + VLOG(8) << "downstream_map: " << std::endl + << StringizeDownstreamMap(op_downstream_map_); is_build_ = true; return op_downstream_map_; } -void IrDependencyBuilder::BuildDownstreamMap() { - auto var2min_rw_op = - std::map>(); // # map from variable id to read - // write op id. - auto var2recent_write_op = - std::map(); // # map from variable to recent write op. 
- auto op2dependences = - std::map>(); //# map from op to the dependence list, - // op must run after the dependence. - std::set - remove_duplicate; // remove the duplicate between inputs and outputs - - // reserve - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - op2dependences[op_idx] = std::set(); - } - - // auto update_var_min_rw_op = - // [](const std::map>& op2dependences, - // std::map>* var2min_rw_op, - // size_t cur_op, - // size_t rw_var) { - // // rw_var is inputs or outputs of cur_op - // // this function update the var2min_rw_op set . - // if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { - // (*var2min_rw_op)[rw_var] = std::list(); - // } - // for (auto dep_op : op2dependences.at(cur_op)) { - // var2min_rw_op->at(rw_var).remove(dep_op); - // } - // var2min_rw_op->at(rw_var).push_back(cur_op); - // }; - - // for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - // remove_duplicate.clear(); - // // step1: update the op2dependences structure - // for (auto& item : - // instructions_->at(op_idx).Inputs()) { // for all inputs(read only) - // for (auto var : item.second) { - // if (var2recent_write_op.count(var)) - // op2dependences[op_idx].insert(var2recent_write_op[var]); - // } - // } - - // for (auto& item : - // instructions_->at(op_idx).Outputs()) { // for all write vars - // for (auto var : item.second) { - // if (var2min_rw_op.count(var)) { - // for (auto dep_op : var2min_rw_op[var]) { - // op2dependences[op_idx].insert(dep_op); - // } - // } - // } - // } - // // the original output of inplace op is also change. - // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { - // auto& m = instructions_->at(op_idx).InplaceBackMap(); - // for (auto& p : m) { - // auto& var = p.second; - // if (var2min_rw_op.count(var)) { - // for (auto dep_op : var2min_rw_op[var]) { - // op2dependences[op_idx].insert(dep_op); - // } - // } - // } - // } - - // // step2: update 2 var2xxxx data structure - // for (auto& item : - // instructions_->at(op_idx).Outputs()) { // for all write vars - // for (auto var : item.second) { - // var2recent_write_op[var] = op_idx; - // var2min_rw_op[var] = {static_cast(op_idx)}; - // remove_duplicate.insert(var); - // } - // } - - // // NOTE(zhiqiu): The inplace op with `transfer` also changes - // // original output after that so add original output as well - // // original: a->op->a - // // after: a->data_transfer->a'->op->a'->transfer_back->a - // // which means op writes a and a' - // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { - // auto& m = instructions_->at(op_idx).InplaceBackMap(); - // for (auto& p : m) { - // auto var = p.second; - // var2recent_write_op[var] = op_idx; - // var2min_rw_op[var] = {static_cast(op_idx)}; - // remove_duplicate.insert(var); - // } - // } - - // for (auto& item : - // instructions_->at(op_idx).Inputs()) { // for all inputs(read only) - // for (auto var : item.second) { - // if (remove_duplicate.count(var) == - // 0) { // var in input list and in output list, so remove it. - // update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); - // } - // } - // } - // } - - // // convert op2dependences to downstream_map directly. op2dependences is op - // -> - // // it's dependences, we want to get op -> [next ops] map, where ops is the - // // next instruction of op. The size of downstream != size of op2dependences - // // since there are some ops that have no downstream-op. 
- // for (auto& item : op2dependences) { - // size_t op = item.first; - // for (auto dep_op : item.second) { - // AddDownstreamOp(dep_op, op); - // } - // } -} - } // namespace interpreter } // namespace framework } // namespace paddle From 5c0f95ae99bd676f15a6f0417086cf3abaa1ba09 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Mon, 17 Jul 2023 12:53:56 +0000 Subject: [PATCH 04/18] add code --- .../interpreter/dependency_builder.cc | 42 +---------- .../new_executor/new_ir_interpreter.cc | 70 +++++++++---------- 2 files changed, 37 insertions(+), 75 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index cea075e6560e27..77bc3b69b9ac79 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -649,6 +649,7 @@ void IrDependencyBuilder::BuildDownstreamMap() { // write op id. auto var2recent_write_op = std::map(); // # map from variable to recent write op. + auto op2dependences = std::map>(); //# map from op to the dependence list, @@ -699,19 +700,6 @@ void IrDependencyBuilder::BuildDownstreamMap() { } } - // // the original output of inplace op is also change. - // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { - // auto& m = instructions_->at(op_idx).InplaceBackMap(); - // for (auto& p : m) { - // auto& var = p.second; - // if (var2min_rw_op.count(var)) { - // for (auto dep_op : var2min_rw_op[var]) { - // op2dependences[op_idx].insert(dep_op); - // } - // } - // } - // } - // step2: update 2 var2xxxx data structure for (auto& item : instructions_->at(op_idx)->Outputs()) { // for all write vars @@ -722,21 +710,6 @@ void IrDependencyBuilder::BuildDownstreamMap() { } } - // // NOTE(zhiqiu): The inplace op with `transfer` also changes - // // original output after that so add original output as well - // // original: a->op->a - // // after: a->data_transfer->a'->op->a'->transfer_back->a - // // which means op writes a and a' - // if (!instructions_->at(op_idx).InplaceBackMap().empty()) { - // auto& m = instructions_->at(op_idx).InplaceBackMap(); - // for (auto& p : m) { - // auto var = p.second; - // var2recent_write_op[var] = op_idx; - // var2min_rw_op[var] = {static_cast(op_idx)}; - // remove_duplicate.insert(var); - // } - // } - for (auto& item : instructions_->at(op_idx)->Inputs()) { // for all inputs(read only) for (auto var : item.second) { @@ -784,18 +757,7 @@ const std::map>& IrDependencyBuilder::Build( AddDependencyForSequentialRun(); } - // AddDependencyForCoalesceTensorOp(); - - // if (FLAGS_add_dependency_for_communication_op) { - // AddDependencyForCommunicationOp(); - // VLOG(6) << "Finish AddDependencyForSequentialRun"; - // } - - // AddDependencyForRandomOp(); - // VLOG(6) << "Finish AddDependencyForRandomOp"; - - // AddDependencyForReadOp(); - // VLOG(6) << "Finish AddDependencyForReadOp"; + // TODO(zhangbo): Add dependency for special op. 
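  // (In the old-IR DependencyBuilder::Build this point runs
  // AddDependencyForCoalesceTensorOp, AddDependencyForCommunicationOp,
  // AddDependencyForRandomOp and AddDependencyForReadOp; their
  // IrDependencyBuilder counterparts are already declared in the header.)
  // A minimal sketch of the random-op rule, assuming a hypothetical
  // IsRandomOp(name) predicate -- chain all random ops so their RNG state
  // is consumed in a fixed order:
  //   size_t prev = ULLONG_MAX;
  //   for (size_t i = 0; i < op_num_; ++i) {
  //     if (IsRandomOp(instructions_->at(i)->Name())) {
  //       if (prev != ULLONG_MAX) AddDownstreamOp(prev, i);
  //       prev = i;
  //     }
  //   }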
VLOG(6) << "Finish build dependency"; VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 7021c3835e5f62..9b9f0f8ff1aa66 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1573,43 +1573,43 @@ void NewIRInterpreter::BuildInstructionDependences() { dependecy_count_ = std::vector(instr_num, 0); auto downstream_map = ir_dependency_builder_.Build(vec_instruction_base_); - // for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { - // Instruction& cur_instr = vec_instruction_[instr_id]; - // const std::set& next_instr_ids = downstream_map[instr_id]; + for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { + InstructionBase* cur_instr = vec_instruction_base_[instr_id].get(); + const std::set& next_instr_ids = downstream_map[instr_id]; - // if (FLAGS_new_executor_serial_run) { - // for (size_t next_instr_id : next_instr_ids) { - // cur_instr.AddNextInstrInSameThread(next_instr_id); - // } - // } else { - // if (cur_instr.KernelType() == OpFuncType::kGpuAsync) { - // for (size_t next_instr_id : next_instr_ids) { - // if (vec_instruction_[next_instr_id].KernelType() == - // OpFuncType::kGpuAsync) { - // cur_instr.AddNextInstrInSameThread(next_instr_id); - // } else { - // cur_instr.AddNextInstrInDifferentThread(next_instr_id); - // } - // } - // } else { - // bool has_instr_in_same_thread = false; - // for (size_t next_instr_id : next_instr_ids) { - // if (!has_instr_in_same_thread && - // vec_instruction_[next_instr_id].KernelType() != - // OpFuncType::kGpuAsync) { - // cur_instr.AddNextInstrInSameThread(next_instr_id); - // has_instr_in_same_thread = true; - // } else { - // cur_instr.AddNextInstrInDifferentThread(next_instr_id); - // } - // } - // } - // } + if (FLAGS_new_executor_serial_run) { + for (size_t next_instr_id : next_instr_ids) { + cur_instr->AddNextInstrInSameThread(next_instr_id); + } + } else { + if (cur_instr->KernelType() == OpFuncType::kGpuAsync) { + for (size_t next_instr_id : next_instr_ids) { + if (vec_instruction_base_[next_instr_id]->KernelType() == + OpFuncType::kGpuAsync) { + cur_instr->AddNextInstrInSameThread(next_instr_id); + } else { + cur_instr->AddNextInstrInDifferentThread(next_instr_id); + } + } + } else { + bool has_instr_in_same_thread = false; + for (size_t next_instr_id : next_instr_ids) { + if (!has_instr_in_same_thread && + vec_instruction_base_[next_instr_id]->KernelType() != + OpFuncType::kGpuAsync) { + cur_instr->AddNextInstrInSameThread(next_instr_id); + has_instr_in_same_thread = true; + } else { + cur_instr->AddNextInstrInDifferentThread(next_instr_id); + } + } + } + } - // for (size_t next_instr_id : next_instr_ids) { - // ++dependecy_count_[next_instr_id]; - // } - // } + for (size_t next_instr_id : next_instr_ids) { + ++dependecy_count_[next_instr_id]; + } + } } } // namespace framework From 11b41067dc59a94ff419b4c9974bf05057469f70 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Tue, 18 Jul 2023 08:10:53 +0000 Subject: [PATCH 05/18] add code --- .../instruction/instruction_base.h | 6 + .../instruction/phi_kernel_instruction.cc | 20 +- .../instruction/phi_kernel_instruction.h | 2 +- .../interpreter/dependency_builder.cc | 246 +++++++++--------- .../new_executor/new_ir_interpreter.cc | 17 ++ .../fluid/ir/interface/op_yaml_info_parser.cc | 11 +- .../fluid/ir/interface/op_yaml_info_parser.h | 2 
+ 7 files changed, 167 insertions(+), 137 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 8c3b9c9385ef20..a31b65c1039d63 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -137,24 +137,30 @@ class InstructionBase { // dist attrs:lower value, higher priority int stream_priority_{0}; + SchedulingPriority scheduling_priority_{0}; + std::string execution_stream_{kDefaultStream}; platform::DeviceContext* dev_ctx_; // not owned std::vector next_instrs_in_different_thread_; + std::vector next_instrs_in_same_thread_; std::shared_ptr event_to_record_; + std::vector events_to_wait_; std::vector gc_check_vars_; std::vector> vec_inplace_in_to_out_; // If not use share data, need this ? + std::map inplace_back_map_; std::unordered_map> input_index_; + std::unordered_map> output_index_; }; diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 787cf564932b5e..566c61512335c3 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -84,14 +84,7 @@ PhiKernelInstruction::PhiKernelInstruction( ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name); phi_op_name_ = op_name; - - if (op_name == "builtin.combine" || op_name == "pd.feed" || - op_name == "builtin.set_parameter" || - op_name == "builtin.get_parameter") { - VLOG(6) << "skip process " << op_name; - SetArtificial(true); - return; - } + VLOG(6) << "construct phi kernel instruction for: " << phi_op_name_; // Todo: support paddle::dialect::DistAttribute // if (op_attributes.count("dist_attr") != 0) { @@ -127,10 +120,17 @@ PhiKernelInstruction::PhiKernelInstruction( infer_meta_interface_ = op_info.GetInterfaceImpl(); + VLOG(6) << "finish process infer_meta_interface_"; + auto yaml_interface = op_info.GetInterfaceImpl(); + PADDLE_ENFORCE_NOT_NULL( + yaml_interface, + phi::errors::PreconditionNotMet( + "can not find OpYamlInfoInterface from [%s]", phi_op_name_)); paddle::dialect::OpYamlInfoParser yaml_info_parser( yaml_interface->get_op_info_()); + VLOG(6) << "finish process yaml_info_parser"; ::ir::BuildPhiContext< phi::InferMetaContext, @@ -178,7 +178,7 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process device context"; Scope* inner_scope = local_scope == nullptr ? 
scope : local_scope; - InitInputsOutputs( + InitInputsOutputsIds( op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); VLOG(6) << "finish process inputs outputs index"; } @@ -204,7 +204,7 @@ std::vector GetValueIds( return ids; } -void PhiKernelInstruction::InitInputsOutputs( +void PhiKernelInstruction::InitInputsOutputsIds( ::ir::Operation* op, Scope* inner_scope, const std::unordered_map<::ir::Value, std::string>& value_2_var_name, diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 7d3eacd2e13234..b30fa8bff751b5 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -55,7 +55,7 @@ class PhiKernelInstruction : public InstructionBase { const std::string& Name() const override { return phi_op_name_; } private: - void InitInputsOutputs( + void InitInputsOutputsIds( ::ir::Operation* op, Scope* inner_scope, const std::unordered_map<::ir::Value, std::string>& value_2_var_name, diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 77bc3b69b9ac79..559e8d7afa6c74 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -528,119 +528,43 @@ void DependencyBuilder::ShrinkDownstreamMap() { << StringizeDownstreamMap(op_downstream_map_); } -// /// ======================== /// -// /// For new ir /// -// /// ======================== /// -void IrDependencyBuilder::AddDependencyForSequentialRun() { - size_t dependence_op_idx = ULLONG_MAX; - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - if (dependence_op_idx != ULLONG_MAX) { - AddDownstreamOp(dependence_op_idx, op_idx); - } - dependence_op_idx = op_idx; +/// ======================== /// +/// For new ir /// +/// ======================== /// +const std::map>& IrDependencyBuilder::Build( + const std::vector>& + instructions) { + if (is_build_) { + return op_downstream_map_; } -} -void IrDependencyBuilder::ShrinkDownstreamMap() { - // remove unnecessary downstream ops - // for example, a->b->c - // a: b, c - // b: c - // => - // a: b - // b: c + instructions_ = &instructions; + op_num_ = instructions_->size(); - // shrink, find the downstream op that has no other op in the - // downstream list happens before it - for (size_t i = 0; i < op_num_; ++i) { - if (op_downstream_map_.find(i) == op_downstream_map_.end()) { - continue; - } + ops_before_.assign(op_num_, {}); + ops_behind_.assign(op_num_, {}); + op_happens_before_.assign(op_num_, std::vector(op_num_, false)); - std::set minumum_nexts; - for (size_t item : op_downstream_map_.at(i)) { - bool not_after_any = true; - // find the op that is not executed after any - for (size_t other_item : op_downstream_map_.at(i)) { - if (OpHappensBefore(other_item, item)) { - VLOG(8) << "happens_before: " << other_item << "->" << item - << ", so skip " << item; - not_after_any = false; - break; - } - } - if (not_after_any) { - VLOG(8) << "downstream op of " << i << ": " << item; - minumum_nexts.insert(item); - } - } - // NOTE(Ruibiao): op_happens_before will not be changed when shrink - // dowstream map - op_downstream_map_.at(i) = minumum_nexts; - } - VLOG(8) << "Finish shrink downstream map"; - VLOG(8) << "downstream count: " << 
CountDownstreamMap(op_downstream_map_); - VLOG(8) << "downstream_map: " << std::endl - << StringizeDownstreamMap(op_downstream_map_); -} + BuildDownstreamMap(); + VLOG(6) << "Finish BuildDownstreamMap"; -void IrDependencyBuilder::AddDownstreamOp(size_t prior_op_idx, - size_t posterior_op_idx) { - PADDLE_ENFORCE_EQ( - OpHappensBefore(posterior_op_idx, prior_op_idx), - false, - phi::errors::Unavailable( - "Can not add dependency %d->%d because %d is run before %d", - prior_op_idx, - posterior_op_idx, - posterior_op_idx, - prior_op_idx)); + ShrinkDownstreamMap(); + VLOG(6) << "Finish ShrinkDownstreamMap"; - std::set& downstream_ops = op_downstream_map_[prior_op_idx]; - // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore - // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, - // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> - // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by - // ShrinkDownstreamMap. - for (size_t op_idx : downstream_ops) { - if (OpHappensBefore(op_idx, posterior_op_idx)) { - VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx << "->" - << posterior_op_idx << ", skip adding " << prior_op_idx << "->" - << posterior_op_idx; - return; - } + if (FLAGS_new_executor_sequential_run) { + AddDependencyForSequentialRun(); } - downstream_ops.insert(posterior_op_idx); - std::vector prior_of_prior = ops_before_[prior_op_idx]; - std::vector posterior_of_posterior = ops_behind_[posterior_op_idx]; + // TODO(zhangbo): Add dependency for special op ? - auto update_op_happen_before = [this](size_t prior_op_idx, - size_t posterior_op_idx) { - if (!op_happens_before_[prior_op_idx][posterior_op_idx]) { - op_happens_before_[prior_op_idx][posterior_op_idx] = true; - ops_before_[posterior_op_idx].push_back(prior_op_idx); - ops_behind_[prior_op_idx].push_back(posterior_op_idx); - } - }; - - update_op_happen_before(prior_op_idx, posterior_op_idx); - - // All ops before prior-op are also before posterior-op - for (size_t op_idx : prior_of_prior) { - update_op_happen_before(op_idx, posterior_op_idx); - } + VLOG(6) << "Finish build dependency"; + VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); + VLOG(8) << "downstream_map: " << std::endl + << StringizeDownstreamMap(op_downstream_map_); - // All ops after posterior-op are also after prior-op - for (size_t op_idx : posterior_of_posterior) { - update_op_happen_before(prior_op_idx, op_idx); - } + is_build_ = true; - VLOG(8) << prior_op_idx << "->" << posterior_op_idx; - VLOG(8) << "Add dependency from " << instructions_->at(prior_op_idx)->Name() - << "(" << prior_op_idx << ") to " - << instructions_->at(posterior_op_idx)->Name() << "(" - << posterior_op_idx << ")"; + return op_downstream_map_; } void IrDependencyBuilder::BuildDownstreamMap() { @@ -733,40 +657,116 @@ void IrDependencyBuilder::BuildDownstreamMap() { } } -const std::map>& IrDependencyBuilder::Build( - const std::vector>& - instructions) { - if (is_build_) { - return op_downstream_map_; +void IrDependencyBuilder::AddDownstreamOp(size_t prior_op_idx, + size_t posterior_op_idx) { + PADDLE_ENFORCE_EQ( + OpHappensBefore(posterior_op_idx, prior_op_idx), + false, + phi::errors::Unavailable( + "Can not add dependency %d->%d because %d is run before %d", + prior_op_idx, + posterior_op_idx, + posterior_op_idx, + prior_op_idx)); + + std::set& downstream_ops = op_downstream_map_[prior_op_idx]; + // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, 
therefore
+  // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example,
+  // a->c will not be shrunk in the following case: AddDownstreamOp(a, b) ->
+  // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c); it should be shrunk by
+  // ShrinkDownstreamMap.
+  for (size_t op_idx : downstream_ops) {
+    if (OpHappensBefore(op_idx, posterior_op_idx)) {
+      VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx << "->"
+              << posterior_op_idx << ", skip adding " << prior_op_idx << "->"
+              << posterior_op_idx;
+      return;
+    }
   }
+  downstream_ops.insert(posterior_op_idx);

-  instructions_ = &instructions;
-  op_num_ = instructions_->size();
+  std::vector<size_t> prior_of_prior = ops_before_[prior_op_idx];
+  std::vector<size_t> posterior_of_posterior = ops_behind_[posterior_op_idx];

-  ops_before_.assign(op_num_, {});
-  ops_behind_.assign(op_num_, {});
-  op_happens_before_.assign(op_num_, std::vector<bool>(op_num_, false));
+  auto update_op_happen_before = [this](size_t prior_op_idx,
+                                        size_t posterior_op_idx) {
+    if (!op_happens_before_[prior_op_idx][posterior_op_idx]) {
+      op_happens_before_[prior_op_idx][posterior_op_idx] = true;
+      ops_before_[posterior_op_idx].push_back(prior_op_idx);
+      ops_behind_[prior_op_idx].push_back(posterior_op_idx);
+    }
+  };

-  BuildDownstreamMap();
-  VLOG(6) << "Finish BuildDownstreamMap";
+  update_op_happen_before(prior_op_idx, posterior_op_idx);

-  ShrinkDownstreamMap();
-  VLOG(6) << "Finish ShrinkDownstreamMap";
+  // All ops before prior-op are also before posterior-op
+  for (size_t op_idx : prior_of_prior) {
+    update_op_happen_before(op_idx, posterior_op_idx);
+  }

-  if (FLAGS_new_executor_sequential_run) {
-    AddDependencyForSequentialRun();
+  // All ops after posterior-op are also after prior-op
+  for (size_t op_idx : posterior_of_posterior) {
+    update_op_happen_before(prior_op_idx, op_idx);
   }

-  // TODO(zhangbo): Add dependency for special op.
+  VLOG(8) << prior_op_idx << "->" << posterior_op_idx;
+  VLOG(8) << "Add dependency from " << instructions_->at(prior_op_idx)->Name()
+          << "(" << prior_op_idx << ") to "
+          << instructions_->at(posterior_op_idx)->Name() << "("
+          << posterior_op_idx << ")";
+}
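AddDownstreamOp maintains the happens-before relation as an incrementally updated transitive closure: when an edge prior->posterior is inserted, every op already before prior must also precede posterior, and every op already behind posterior must also follow prior. The standalone sketch below is illustrative only (it is not part of the patch, and all names are invented); it shows the same update rule in isolation:

// Illustrative sketch of the happens-before propagation used above.
// Only the update rule mirrors the patch; everything else is hypothetical.
#include <cstdio>
#include <vector>

int main() {
  const size_t n = 3;  // three ops, to be chained 0 -> 1 -> 2
  std::vector<std::vector<bool>> happens_before(n, std::vector<bool>(n, false));
  std::vector<std::vector<size_t>> before(n), behind(n);

  auto update = [&](size_t prior, size_t posterior) {
    if (!happens_before[prior][posterior]) {
      happens_before[prior][posterior] = true;
      before[posterior].push_back(prior);
      behind[prior].push_back(posterior);
    }
  };

  auto add_edge = [&](size_t prior, size_t posterior) {
    // Snapshot first: update() appends to the containers being iterated.
    std::vector<size_t> prior_of_prior = before[prior];
    std::vector<size_t> posterior_of_posterior = behind[posterior];
    update(prior, posterior);
    for (size_t op : prior_of_prior) update(op, posterior);
    for (size_t op : posterior_of_posterior) update(prior, op);
  };

  add_edge(0, 1);
  add_edge(1, 2);
  // The transitive fact 0->2 was derived without an explicit edge:
  std::printf("0 happens before 2: %d\n",
              static_cast<int>(happens_before[0][2]));
  return 0;
}

Note that the patch likewise copies ops_before_ and ops_behind_ into local vectors before propagating; that snapshot is what makes it safe to push into those same containers inside the update lambda.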

-  VLOG(6) << "Finish build dependency";
+void IrDependencyBuilder::ShrinkDownstreamMap() {
+  // Remove unnecessary downstream ops. For example, a->b->c
+  // a: b, c
+  // b: c
+  // =>
+  // a: b
+  // b: c
+
+  // Shrink: for each op, keep only those downstream ops that no other op in
+  // the downstream list happens before.
+  for (size_t i = 0; i < op_num_; ++i) {
+    if (op_downstream_map_.find(i) == op_downstream_map_.end()) {
+      continue;
+    }
+
+    std::set<size_t> minimum_nexts;
+    for (size_t item : op_downstream_map_.at(i)) {
+      bool not_after_any = true;
+      // Find the op that is not executed after any other downstream op.
+      for (size_t other_item : op_downstream_map_.at(i)) {
+        if (OpHappensBefore(other_item, item)) {
+          VLOG(8) << "happens_before: " << other_item << "->" << item
+                  << ", so skip " << item;
+          not_after_any = false;
+          break;
+        }
+      }
+      if (not_after_any) {
+        VLOG(8) << "downstream op of " << i << ": " << item;
+        minimum_nexts.insert(item);
+      }
+    }
+    // NOTE(Ruibiao): op_happens_before will not be changed when shrinking the
+    // downstream map.
+    op_downstream_map_.at(i) = minimum_nexts;
+  }
+  VLOG(8) << "Finish shrink downstream map";
   VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_);
   VLOG(8) << "downstream_map: " << std::endl
           << StringizeDownstreamMap(op_downstream_map_);
+}

-  is_build_ = true;
-
-  return op_downstream_map_;
+void IrDependencyBuilder::AddDependencyForSequentialRun() {
+  size_t dependence_op_idx = ULLONG_MAX;
+  for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) {
+    if (dependence_op_idx != ULLONG_MAX) {
+      AddDownstreamOp(dependence_op_idx, op_idx);
+    }
+    dependence_op_idx = op_idx;
+  }
 }

 }  // namespace interpreter
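Before moving on to the interpreter changes: ShrinkDownstreamMap keeps only the frontier of each op's downstream set, dropping any successor that is reachable through another successor, which reduces the number of dependency notifications at run time. A minimal sketch of the pruning rule follows; it is illustrative only, with a hard-coded happens_before lambda standing in for the builder's precomputed matrix:

// Illustrative sketch of the shrink step: given a->b->c, the direct edge
// a->c is redundant and gets pruned. Names here are hypothetical.
#include <cstdio>
#include <map>
#include <set>

int main() {
  std::map<size_t, std::set<size_t>> downstream = {{0, {1, 2}}, {1, {2}}};
  // Stand-in for OpHappensBefore(); only the edge 1->2 is known here.
  auto happens_before = [](size_t a, size_t b) { return a == 1 && b == 2; };

  for (auto& entry : downstream) {
    std::set<size_t>& nexts = entry.second;
    std::set<size_t> minimum_nexts;
    for (size_t item : nexts) {
      bool not_after_any = true;
      for (size_t other : nexts) {
        if (happens_before(other, item)) {  // item is reachable via other
          not_after_any = false;
          break;
        }
      }
      if (not_after_any) minimum_nexts.insert(item);
    }
    nexts = minimum_nexts;
  }
  // Prints "0: 1" and "1: 2"; the redundant 0->2 edge is gone.
  for (auto& entry : downstream)
    for (size_t n : entry.second) std::printf("%zu: %zu\n", entry.first, n);
  return 0;
}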
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index 9b9f0f8ff1aa66..ba5141509e1d1c 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -193,6 +193,11 @@ FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,
                      &variable_list_);
     VLOG(4) << DebugValueInfo();

+  // NOTE(zhangbo): Iterative version, gradually replacing BuildOpFuncList()
+  // and Convert()
+  BuildInstruction();
+  BuildInstructionDependences();
+
   std::vector<paddle::framework::OpFuncNode> op_func_nodes;
   interpreter::BuildOpFuncList(place_,
                                ir_program_->block(),
@@ -1537,6 +1542,18 @@ void NewIRInterpreter::BuildInstruction() {
        ++it) {
     VLOG(0) << "Build Instruction for op: " << op_idx;
     if ((*it)->dialect()->name() == "pd_kernel") {
+      auto op_name = (*it)
+                         ->attributes()
+                         .at("op_name")
+                         .dyn_cast<::ir::StrAttribute>()
+                         .data();
+      if (op_name == "builtin.combine" || op_name == "builtin.slice" ||
+          op_name == "pd.feed" || op_name == "pd.fetch" ||
+          op_name == "builtin.set_parameter" ||
+          op_name == "builtin.get_parameter") {
+        VLOG(6) << "skip process " << op_name;
+        continue;
+      }
       vec_instruction_base_.emplace_back(
           std::make_unique<PhiKernelInstruction>(op_idx++,
                                                  place_,
diff --git a/paddle/fluid/ir/interface/op_yaml_info_parser.cc b/paddle/fluid/ir/interface/op_yaml_info_parser.cc
index b21e4f82a70cc9..c1b0dd73764ffb 100644
--- a/paddle/fluid/ir/interface/op_yaml_info_parser.cc
+++ b/paddle/fluid/ir/interface/op_yaml_info_parser.cc
@@ -88,6 +88,10 @@ const std::map& OpYamlInfoParser::InputName2Id() const {
   return input_name2id_;
 }

+const std::map& OpYamlInfoParser::OutputName2Id() const {
+  return output_name2id_;
+}
+
 bool OpYamlInfoParser::HasInplace(const std::string& out_name) const {
   auto inplace_info = std::get<3>(op_info_tuple_).inplace;
   for (size_t i = 0; i < inplace_info.size(); i++) {
@@ -113,10 +117,9 @@ const std::string& OpYamlInfoParser::InplaceName(
 void OpYamlInfoParser::parse() {
   auto input_info = std::get<0>(op_info_tuple_);

-  int start_index = 0;
-
+  int input_start_index = 0;
   for (size_t i = 0; i < input_info.size(); ++i) {
-    input_name2id_[input_info[i].name] = start_index++;
+    input_name2id_[input_info[i].name] = input_start_index++;
     input_name_list_.push_back(input_info[i].name);
     input_info_[input_info[i].name] = input_info[i];
     if (!input_info[i].is_mutable_attribute) {
@@ -130,8 +133,10 @@ void OpYamlInfoParser::parse() {
     attr_info_[attribute_info[i].name] = attribute_info[i];
   }

+  int output_start_index = 0;
   auto output_info = std::get<2>(op_info_tuple_);
   for (size_t i = 0; i < output_info.size(); ++i) {
+    output_name2id_[output_info[i].name] = output_start_index++;
     output_name_list_.push_back(output_info[i].name);
     output_info_[output_info[i].name] = output_info[i];
   }
diff --git a/paddle/fluid/ir/interface/op_yaml_info_parser.h b/paddle/fluid/ir/interface/op_yaml_info_parser.h
index b2897b0fc2ecd6..6b600a6d70e812 100644
--- a/paddle/fluid/ir/interface/op_yaml_info_parser.h
+++ b/paddle/fluid/ir/interface/op_yaml_info_parser.h
@@ -35,6 +35,7 @@ class OpYamlInfoParser {
   const std::vector& AttrParams(bool is_kernel = false) const;
   const OpRunTimeInfo& OpRuntimeInfo() const;
   const std::map& InputName2Id() const;
+  const std::map& OutputName2Id() const;

   const std::vector& InputNames() const {
     return input_name_list_;
@@ -69,6 +70,7 @@ class OpYamlInfoParser {
   std::map attr_info_;

   // output info
+  std::map output_name2id_;
   std::vector output_name_list_;
   std::map output_info_;

From 6d46e06e1e3e3f14911bc1d1aeb39e27b9788e9f Mon Sep 17 00:00:00 2001
From: zhangbo9674
Date: Tue, 18 Jul 2023 09:12:29 +0000
Subject: [PATCH 06/18] fix bug

---
 .../framework/new_executor/new_ir_interpreter.cc |  8 +++++++-
 .../ir/phi_kernel_adaptor/phi_kernel_adaptor.h   |  3 +++
 .../ir/phi_kernel_adaptor/phi_kernel_util.cc     | 16 +++++++++++++++-
 .../ir/phi_kernel_adaptor/phi_kernel_util.h      |  1 +
 4 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index ba5141509e1d1c..3651146b38acba 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -185,8 +185,11 @@ FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,

   if (!is_build_) {
     LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
+    std::stringstream ss;
+    ss << this;
     ::ir::BuildScope(*ir_program_->block(),
                      InnerScope(),
+                     ss.str(),
                      &value_2_var_name_,
                      &variable_2_var_name_,
                      &var_name_2_id_,
@@ -247,8 +250,11 @@ FetchList NewIRInterpreter::BetaRun(const std::vector<std::string>& feed_names,
   SetDeviceId(place_);
   if (!is_build_) {
     LOG_FIRST_N(INFO, 1) << "New Executor is BetaRunning.";
+    std::stringstream ss;
+    ss << this;
     ::ir::BuildScope(*ir_program_->block(),
                      InnerScope(),
+                     ss.str(),
                      &value_2_var_name_,
                      &variable_2_var_name_,
                      &var_name_2_id_,
@@ -1546,7 +1552,7 @@ void NewIRInterpreter::BuildInstruction() {
           ->attributes()
           .at("op_name")
           .dyn_cast<::ir::StrAttribute>()
-          .data();
+          .AsString();
       if (op_name == "builtin.combine" || op_name == "builtin.slice" ||
           op_name == "pd.feed" ||
op_name == "pd.fetch" || op_name == "builtin.set_parameter" || diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h index 1466a580ff0141..24066abecc0434 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_adaptor.h @@ -60,9 +60,12 @@ class PhiKernelAdaptor { variable_2_var_name; std::map var_name_2_id; std::vector variable_list; + std::stringstream ss; + ss << this; BuildScope(*block, scope_, + ss.str(), &value_2_var_name, &variable_2_var_name, &var_name_2_id, diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index f1ad5042cd69f6..ad3a804eac9116 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -49,6 +49,7 @@ using VariableNameMap = paddle::framework::Variable* CreateVar( ir::Value value, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, bool force_persisable, std::unordered_map* value_2_var_name, std::unordered_map* @@ -65,7 +66,8 @@ paddle::framework::Variable* CreateVar( } paddle::framework::Variable* var = nullptr; - std::string name = "inner_var_" + std::to_string(variable_2_var_name->size()); + std::string name = var_name_prefix + "_inner_var_" + + std::to_string(variable_2_var_name->size()); if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root(); var = const_cast(inner_scope->root())->Var(name); @@ -109,6 +111,7 @@ void CheckInputVars( void BuildValue(ir::Value value, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, @@ -120,6 +123,7 @@ void BuildValue(ir::Value value, } else { var = CreateVar(value, inner_scope, + var_name_prefix, false, value_2_var_name, variable_2_var_name, @@ -146,6 +150,7 @@ void BuildValue(ir::Value value, "DenseTensorType")); auto var_i = CreateVar(value, inner_scope, + var_name_prefix, false, value_2_var_name, variable_2_var_name, @@ -163,6 +168,7 @@ void BuildValue(ir::Value value, void HandleForSpecialOp( ir::Operation* op, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, @@ -189,6 +195,7 @@ void HandleForSpecialOp( auto value = op->result(0); auto var = CreateVar(value, inner_scope, + var_name_prefix, false, value_2_var_name, variable_2_var_name, @@ -217,6 +224,7 @@ void HandleForSpecialOp( } else { var = CreateVar(out_value, inner_scope, + var_name_prefix, false, value_2_var_name, variable_2_var_name, @@ -296,6 +304,7 @@ void HandleForSpecialOp( void HandleForInplaceOp( ir::Operation* op, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, @@ -328,6 +337,7 @@ void HandleForInplaceOp( } else { BuildValue(value, inner_scope, + var_name_prefix, value_2_var_name, variable_2_var_name, var_name_2_id, @@ -340,6 +350,7 @@ void HandleForInplaceOp( // created in inner_scope. 
void BuildScope(const ir::Block& block, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, @@ -367,6 +378,7 @@ void BuildScope(const ir::Block& block, op_name == "builtin.get_parameter" || op_name == "builtin.slice") { HandleForSpecialOp(op, inner_scope, + var_name_prefix, value_2_var_name, variable_2_var_name, var_name_2_id, @@ -384,6 +396,7 @@ void BuildScope(const ir::Block& block, .data()) { HandleForInplaceOp(op, inner_scope, + var_name_prefix, value_2_var_name, variable_2_var_name, var_name_2_id, @@ -393,6 +406,7 @@ void BuildScope(const ir::Block& block, for (size_t i = 0; i < op->num_results(); ++i) { BuildValue(op->result(i), inner_scope, + var_name_prefix, value_2_var_name, variable_2_var_name, var_name_2_id, diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h index 91a03206095bdd..08b9baceadfd55 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h @@ -43,6 +43,7 @@ namespace ir { void BuildScope(const ir::Block& block, paddle::framework::Scope* inner_scope, + const std::string& var_name_prefix, std::unordered_map* value_2_var_name, std::unordered_map* variable_2_var_name, From e9a6c919cfaa1670a2c912dc3ef730d8ff00a86c Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Tue, 18 Jul 2023 13:02:24 +0000 Subject: [PATCH 07/18] fix bug --- .../new_executor/instruction/phi_kernel_instruction.cc | 4 ++-- paddle/fluid/framework/new_executor/new_ir_interpreter.cc | 4 ++-- paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index cf4b65426655e2..50a8161cd4332a 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -256,10 +256,10 @@ void PhiKernelInstruction::InitInputsOutputsIds( } void PhiKernelInstruction::Run() { - VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; infer_meta_interface_->infer_meta_(&(infer_meta_context_)); - VLOG(6) << "Run op " << phi_op_name_ << " kernel."; + VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; (*(phi_kernel_))(&(kernel_context_)); + VLOG(6) << "Run op " << phi_op_name_ << " kernel."; } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3651146b38acba..b55c58ba8cce28 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -198,8 +198,8 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, // NOTE(zhangbo): Iterative version, gradually replacing BuildOpFuncList() // and Convert() - BuildInstruction(); - BuildInstructionDependences(); + // BuildInstruction(); + // BuildInstructionDependences(); std::vector op_func_nodes; interpreter::BuildOpFuncList(place_, diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index ad3a804eac9116..62fffc28e18f81 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -66,8 +66,8 @@ 
paddle::framework::Variable* CreateVar( } paddle::framework::Variable* var = nullptr; - std::string name = var_name_prefix + "_inner_var_" + - std::to_string(variable_2_var_name->size()); + VLOG(6) << "var_name_prefix is: " << var_name_prefix; + std::string name = "inner_var_" + std::to_string(variable_2_var_name->size()); if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root(); var = const_cast(inner_scope->root())->Var(name); From 9801a5f7b27bd465dd0e3551d0a045520d1fb904 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Wed, 19 Jul 2023 03:16:45 +0000 Subject: [PATCH 08/18] add var prefix --- .../ir/phi_kernel_adaptor/phi_kernel_util.cc | 4 +- .../standalone_executor_new_ir_test.cc | 37 +++++++++++++------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 62fffc28e18f81..ad3a804eac9116 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -66,8 +66,8 @@ paddle::framework::Variable* CreateVar( } paddle::framework::Variable* var = nullptr; - VLOG(6) << "var_name_prefix is: " << var_name_prefix; - std::string name = "inner_var_" + std::to_string(variable_2_var_name->size()); + std::string name = var_name_prefix + "_inner_var_" + + std::to_string(variable_2_var_name->size()); if (force_persisable || is_persisable) { VLOG(6) << "Create var: " << name << " in scope " << inner_scope->root(); var = const_cast(inner_scope->root())->Var(name); diff --git a/test/cpp/new_executor/standalone_executor_new_ir_test.cc b/test/cpp/new_executor/standalone_executor_new_ir_test.cc index 4c52621190227e..b86731a1e5ba94 100644 --- a/test/cpp/new_executor/standalone_executor_new_ir_test.cc +++ b/test/cpp/new_executor/standalone_executor_new_ir_test.cc @@ -22,6 +22,7 @@ #include "paddle/phi/core/kernel_registry.h" +#include "paddle/fluid/framework/new_executor/new_ir_interpreter.h" #include "paddle/fluid/ir/dialect/pd_dialect.h" #include "paddle/fluid/ir/dialect/pd_op.h" #include "paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h" @@ -69,14 +70,23 @@ TEST(StandaloneExecutor, run) { ProgramDesc prog_desc; InterpreterCore test_core(place, std::move(kernel_program), &scope); + VLOG(0) << "&test_core" << &test_core; + VLOG(0) << "&test_core.impl" << test_core.Impl(); + VLOG(0) << "&test_core.impl.cast" + << reinterpret_cast( + const_cast(test_core.Impl())); test_core.BetaRun({}); - - auto out_tensor = test_core.local_scope() == nullptr - ? scope.FindVar("inner_var_2")->Get() - : test_core.local_scope() - ->FindVar("inner_var_2") - ->Get(); + std::stringstream os; + os << reinterpret_cast( + const_cast(test_core.Impl())); + std::string prefix_str = os.str(); + auto out_tensor = + test_core.local_scope() == nullptr + ? scope.FindVar(prefix_str + "_inner_var_2")->Get() + : test_core.local_scope() + ->FindVar(prefix_str + "_inner_var_2") + ->Get(); bool res0 = simple_cmp(out_tensor.data()[0], 2.0); bool res1 = simple_cmp(out_tensor.data()[1], 2.0); @@ -107,11 +117,16 @@ TEST(StandaloneExecutor, run_inplace_sqrt) { InterpreterCore test_core(place, std::move(kernel_program), &scope); test_core.BetaRun({}); - auto out_tensor = test_core.local_scope() == nullptr - ? 
scope.FindVar("inner_var_0")->Get() - : test_core.local_scope() - ->FindVar("inner_var_0") - ->Get(); + std::stringstream os; + os << reinterpret_cast( + const_cast(test_core.Impl())); + std::string prefix_str = os.str(); + auto out_tensor = + test_core.local_scope() == nullptr + ? scope.FindVar(prefix_str + "_inner_var_0")->Get() + : test_core.local_scope() + ->FindVar(prefix_str + "_inner_var_0") + ->Get(); bool res0 = simple_cmp(out_tensor.data()[0], 2.0); bool res1 = simple_cmp(out_tensor.data()[1], 2.0); From 9cd63623b689b40564c9ce2e625172e32a2cdf39 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Wed, 19 Jul 2023 12:19:09 +0000 Subject: [PATCH 09/18] add code --- .../instruction/instruction_base.h | 17 +-- .../instruction/phi_kernel_instruction.cc | 119 +++++++++++++++++- .../interpreter/dependency_builder.cc | 3 +- .../interpreter/dependency_builder.h | 11 +- .../interpreter/interpreter_util.cc | 23 ++++ .../interpreter/interpreter_util.h | 3 + .../interpreter/stream_analyzer.h | 50 ++++++++ .../new_executor/new_ir_interpreter.cc | 25 ++-- .../new_executor/new_ir_interpreter.h | 5 +- 9 files changed, 226 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index a31b65c1039d63..d06419fe3016f3 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -107,22 +107,23 @@ class InstructionBase { std::map& GetMutableInplaceBackMap() { return inplace_back_map_; } const std::map& GetInplaceBackMap() { return inplace_back_map_; } - const std::unordered_map>& Inputs() const { + const std::unordered_map<::ir::Value, std::vector>& Inputs() const { return input_index_; } - std::unordered_map>& GetMutableInputs() { + std::unordered_map<::ir::Value, std::vector>& GetMutableInputs() { return input_index_; } - void SetInputs(const std::unordered_map>& inputs); + void SetInputs( + const std::unordered_map<::ir::Value, std::vector>& inputs); - const std::unordered_map>& Outputs() const { + const std::unordered_map<::ir::Value, std::vector>& Outputs() const { return output_index_; } - std::unordered_map>& GetMutableOutputs() { + std::unordered_map<::ir::Value, std::vector>& GetMutableOutputs() { return output_index_; } void SetOutputs( - const std::unordered_map>& outputs); + const std::unordered_map<::ir::Value, std::vector>& outputs); virtual void Run() = 0; @@ -159,9 +160,9 @@ class InstructionBase { std::map inplace_back_map_; - std::unordered_map> input_index_; + std::unordered_map<::ir::Value, std::vector> input_index_; - std::unordered_map> output_index_; + std::unordered_map<::ir::Value, std::vector> output_index_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 50a8161cd4332a..0aa08fa9f6ad54 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/ir/interface/op_yaml_info.h" #include "paddle/fluid/ir/interface/op_yaml_info_parser.h" #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -32,6 
+33,114 @@ namespace paddle { namespace framework { +using DeviceContext = paddle::platform::DeviceContext; + +class IrContextManager { + public: + using DeviceContextMap = + std::map>>; + + static IrContextManager& Instance() { + static IrContextManager* ctx_manager = new IrContextManager; + return *ctx_manager; + } + + std::shared_future> Get( + const std::string& type, + const platform::Place& place, + int stream_priority) { + std::lock_guard lk(ctx_mtx_); + VLOG(6) << "Get dev_ctx for " << type << " - " << place; + + DeviceContextMap& ctxs = ctx_pool_[type]; + if (ctxs.find(place) == ctxs.end()) { + platform::EmplaceDeviceContexts( + &ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true, + stream_priority); + } + return ctxs[place]; + } + + private: + IrContextManager() {} + DISABLE_COPY_AND_ASSIGN(IrContextManager); + + std::mutex ctx_mtx_; + std::unordered_map ctx_pool_; +}; + +platform::DeviceContext* ParseDeviceContext( + ir::Operation* op, + platform::DeviceContext* origin_dev_ctx, + const platform::Place& place, + const std::string& execution_stream, + const int stream_priority) { + auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + IrContextManager& ctx_manager = IrContextManager::Instance(); + + DeviceContext* dev_ctx = nullptr; + + // only gpu need update. xpu not need, because xpu memcpy op kernel is + // synchronous. + if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { + VLOG(6) << "Parse DeviceContext for " << op_name + << ", execution stream = " << execution_stream; + if (execution_stream != kDefaultStream) { + dev_ctx = ctx_manager + .Get(std::string(kCustomStream) + "-" + execution_stream, + place, + stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + + if (op_name == interpreter::kMemcpyD2H) { + dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } else if (op_name == interpreter::kMemcpyH2D) { + dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum + // with use_cal_stream==false by returning a device context getting from the + // global NCCLCommContext instance. Because when use_calc_stream==false, in + // OP kernel, the NCCL communication will be launched to the stream directly + // getting from the global NCCLCommContext instance rather than the + // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in + // c_allreduce_op.h). Now it is just a temporary solution for ONLY + // c_allreduce_sum which is used in ResNet50 distributed training. 
+ if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") + .dyn_cast<::ir::BoolAttribute>() + .data() == false) { + int ring_id = + op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); + return platform::NCCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + } +#endif + } + + if (origin_dev_ctx != nullptr) { + interpreter::SetDeviceCommContext(op, origin_dev_ctx); + } + return origin_dev_ctx; +} + OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) { if (platform::is_cpu_place(place)) { return OpFuncType::kCpuSync; @@ -172,9 +281,13 @@ PhiKernelInstruction::PhiKernelInstruction( kernel_context_.SetDeviceContext(phi::DeviceContextPool::Instance().Get( phi::TransToPhiPlace(kernel_key.backend()))); VLOG(6) << "finish process kernel context"; - - SetDeviceContext(phi::DeviceContextPool::Instance().Get( - phi::TransToPhiPlace(kernel_key.backend()))); + SetDeviceContext( + ParseDeviceContext(op, + phi::DeviceContextPool::Instance().Get( + phi::TransToPhiPlace(kernel_key.backend())), + place, + GetExecutionStream(), + GetStreamPriority())); VLOG(6) << "finish process device context"; Scope* inner_scope = local_scope == nullptr ? scope : local_scope; diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 559e8d7afa6c74..8c5de5f6ed20c2 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -532,8 +532,7 @@ void DependencyBuilder::ShrinkDownstreamMap() { /// For new ir /// /// ======================== /// const std::map>& IrDependencyBuilder::Build( - const std::vector>& - instructions) { + const std::vector& instructions) { if (is_build_) { return op_downstream_map_; } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index eb65c23f4a6527..d134981b655a36 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -85,9 +85,9 @@ class DependencyBuilder { std::vector> op_happens_before_; }; -// /// ======================== /// -// /// For new ir /// -// /// ======================== /// +/// ======================== /// +/// For new ir /// +/// ======================== /// class IrDependencyBuilder { public: IrDependencyBuilder() : is_build_(false), instructions_(nullptr) {} @@ -95,8 +95,7 @@ class IrDependencyBuilder { // build op dependencies and return the mapping from op to its downstream-op // set const std::map>& Build( - const std::vector>& - instructions); + const std::vector& instructions); const std::map>& OpDownstreamMap() const; @@ -122,7 +121,7 @@ class IrDependencyBuilder { void ShrinkDownstreamMap(); bool is_build_; - const std::vector>* + const std::vector* instructions_; // not_own size_t op_num_; diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 70be3b9dd035a3..37ca391b8941a2 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -1104,6 +1104,29 @@ void SetDeviceCommContext(framework::OperatorBase* operator_base, } } +void SetDeviceCommContext(::ir::Operation* op, + platform::DeviceContext* 
dev_ctx) { + auto op_attributes = op->attributes(); + if (op_attributes.count("ring_id") != 0) { + int ring_id = + op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); + const auto& comm_context_manager = + phi::distributed::CommContextManager::GetInstance(); + if (comm_context_manager.Has(ring_id)) { + auto comm_context = comm_context_manager.Get(ring_id); + if (!dev_ctx->GetCommContext()) { + dev_ctx->SetCommContext(comm_context); + } + } else { + VLOG(3) << "op: " + << op_attributes.at("op_name") + .dyn_cast<::ir::StrAttribute>() + .AsString() + << ", ring_id: " << ring_id << ", get comm_context failed!"; + } + } +} + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index eb87c8bcb4ce33..a2fd08f957cda0 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -110,6 +110,9 @@ void LogDeviceMemoryStats(const platform::Place& place); void SetDeviceCommContext(framework::OperatorBase* operator_base, platform::DeviceContext* dev_ctx); + +void SetDeviceCommContext(::ir::Operation* op, + platform::DeviceContext* dev_ctx); } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h index 1dd13c90da3311..c548f0cc819511 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h @@ -78,6 +78,56 @@ class StreamAnalyzer { const Place place_; }; +/// ======================== /// +/// For new ir /// +/// ======================== /// +class IrStreamAnalyzer { + public: + using DeviceContext = platform::DeviceContext; + using Place = platform::Place; + + explicit IrStreamAnalyzer(const Place& place) : place_(place) {} + + ~IrStreamAnalyzer() {} + + void ConstructEvents(std::vector* instructions) const; + + platform::DeviceType GetWaiterType(const Instruction& instr) const; + + private: + bool HasDataDependency(const Instruction& cur_instr, + const Instruction& next_instr) const; + + void AnalyseAllEventInfo( + const std::vector& instructions, + const std::vector>>& run_type_info, + std::map>>* + event_info) const; + + void AnalyseAllRunType( + const std::vector& instructions, + const std::map>& downstream_map, + std::vector>>* run_type_info) const; + + void AnalyseEventInfoForTwoInstructions( + const std::vector& instructions, + const std::vector>>& run_type_info, + const size_t cur_instr_id, + const size_t next_instr_id, + std::set* waiter_instr_ids, + std::set* visited_next_instr_id) const; + + void ShrinkEventInfo( + const DependencyBuilder& dependency_builder, + std::map>>* + event_info_map) const; + + DownstreamRunType AnalyseRunTypeForTwoInstructions( + const Instruction& cur_instr, const Instruction& next_instr) const; + + const Place place_; +}; + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index b55c58ba8cce28..52ca442576a1ad 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -51,7 +51,8 @@ 
NewIRInterpreter::NewIRInterpreter(const platform::Place& place, execution_config_(execution_config), var_scope_(scope), scope_(scope), - ir_program_(std::move(ir_prog)) { + ir_program_(std::move(ir_prog)), + ir_stream_analyzer_(place) { VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_; static_build_ = FLAGS_new_executor_static_build && !FLAGS_new_executor_use_cuda_graph && @@ -98,6 +99,10 @@ NewIRInterpreter::~NewIRInterpreter() { async_work_queue_.reset(); VLOG(4) << "~NewIRInterpreter(): " << this << " on " << place_; + for (InstructionBase* instr : vec_instruction_base_) { + delete instr; + } + #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working @@ -1561,14 +1566,14 @@ void NewIRInterpreter::BuildInstruction() { continue; } vec_instruction_base_.emplace_back( - std::make_unique(op_idx++, - place_, - (*it), - scope_, - local_scope_, - value_2_var_name_, - var_name_2_id_, - variable_2_var_name_)); + new PhiKernelInstruction(op_idx++, + place_, + (*it), + scope_, + local_scope_, + value_2_var_name_, + var_name_2_id_, + variable_2_var_name_)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Now only support pd_kernel dialect.")); @@ -1597,7 +1602,7 @@ void NewIRInterpreter::BuildInstructionDependences() { auto downstream_map = ir_dependency_builder_.Build(vec_instruction_base_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { - InstructionBase* cur_instr = vec_instruction_base_[instr_id].get(); + InstructionBase* cur_instr = vec_instruction_base_[instr_id]; const std::set& next_instr_ids = downstream_map[instr_id]; if (FLAGS_new_executor_serial_run) { diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index dfff64e35843f3..5ab06635f7a8d7 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -194,7 +194,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::unique_ptr<::ir::Program> ir_program_{nullptr}; - std::vector> vec_instruction_base_; + // std::vector> vec_instruction_base_; + std::vector vec_instruction_base_; std::unordered_map<::ir::Value, std::string> value_2_var_name_; @@ -206,6 +207,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::vector variable_list_; interpreter::IrDependencyBuilder ir_dependency_builder_; + + interpreter::IrStreamAnalyzer ir_stream_analyzer_; }; } // namespace framework From 422e02db3416dfd91210afc51a676de94a5d9874 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 20 Jul 2023 09:51:55 +0000 Subject: [PATCH 10/18] add code --- .../instruction/instruction_base.h | 2 + .../interpreter/dependency_builder.cc | 10 + .../interpreter/interpreter_util.cc | 9 + .../interpreter/interpreter_util.h | 5 + .../interpreter/stream_analyzer.cc | 351 ++++++++++++++++++ .../interpreter/stream_analyzer.h | 34 +- .../new_executor/new_ir_interpreter.cc | 14 +- .../new_executor/new_ir_interpreter.h | 1 - 8 files changed, 405 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index d06419fe3016f3..686503e179add2 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -163,6 +163,8 @@ class InstructionBase { std::unordered_map<::ir::Value, std::vector> input_index_; 
std::unordered_map<::ir::Value, std::vector> output_index_; + + std::unordered_set<::ir::Value> no_need_buffer_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 8c5de5f6ed20c2..18c257610e9f08 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -768,6 +768,16 @@ void IrDependencyBuilder::AddDependencyForSequentialRun() { } } +const std::map>& IrDependencyBuilder::OpDownstreamMap() + const { + PADDLE_ENFORCE_EQ( + is_build_, + true, + phi::errors::Unavailable( + "DependencyBuilder is not yet built, call Build() firstly.")); + return op_downstream_map_; +} + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 37ca391b8941a2..991775f0b5f602 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" #include "paddle/fluid/framework/new_executor/interpreter/static_build.h" @@ -156,6 +157,10 @@ bool IsCpuOp(const Instruction& instr) { return platform::is_cpu_place(instr.DeviceContext().GetPlace()); } +bool IsCpuOp(const paddle::framework::InstructionBase* instr) { + return platform::is_cpu_place(instr->DeviceContext().GetPlace()); +} + bool IsGradOp(const std::string& op_name) { return paddle::string::ends_with(op_name, "_grad"); } @@ -173,6 +178,10 @@ bool IsMemcpyH2D(const Instruction& instr) { return instr.OpBase()->Type() == kMemcpyH2D; } +bool IsMemcpyH2D(const paddle::framework::InstructionBase* instr) { + return instr->Name() == "pd.memcpy_h2d"; +} + bool IsMemcpyOp(const Instruction& instr) { return IsMemcpyD2H(instr) || IsMemcpyH2D(instr); } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index a2fd08f957cda0..e31e60ed6e6601 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -43,6 +43,7 @@ using AtomicVectorSizeT = std::vector>; namespace paddle { namespace framework { +class InstructionBase; namespace interpreter { class AsyncWorkQueue { public: @@ -71,12 +72,16 @@ bool IsCommunicationOp(const Instruction& instr); bool IsCpuOp(const Instruction& instr); +bool IsCpuOp(const paddle::framework::InstructionBase* instr); + bool IsGradOp(const std::string& op_name); bool IsMemcpyD2H(const Instruction& instr); bool IsMemcpyH2D(const Instruction& instr); +bool IsMemcpyH2D(const paddle::framework::InstructionBase* instr); + bool IsMemcpyOp(const Instruction& instr); bool IsSupportedHeterPlace(const phi::Place& place); diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc 
b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index bba4e0accc3923..8593ea12af98f2 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device_context.h" @@ -481,6 +482,356 @@ DownstreamRunType StreamAnalyzer::AnalyseRunTypeForTwoInstructions( return DownstreamRunType::kDirectRun; } +/// ======================== /// +/// For new ir /// +/// ======================== /// +void IrStreamAnalyzer::ConstructEvents( + std::vector* instructions) const { + std::vector + cross_step_merged_instructions = *instructions; + for (paddle::framework::InstructionBase* instr : *instructions) { + cross_step_merged_instructions.emplace_back(instr); + } + + IrDependencyBuilder dependency_builder; + dependency_builder.Build(cross_step_merged_instructions); + + const std::map>& downstream_map = + dependency_builder.OpDownstreamMap(); + const size_t instr_num = cross_step_merged_instructions.size(); + std::vector>> run_type_info( + instr_num, + std::vector>( + /*number_of_run_type = */ 2)); // instr_id -> run_type -> + // next_instr_id + AnalyseAllRunType( + cross_step_merged_instructions, downstream_map, &run_type_info); + + std::map>> + event_info; // DeviceContext -> waiter_instr_id -> recorder_instr_ids + AnalyseAllEventInfo( + cross_step_merged_instructions, run_type_info, &event_info); + ShrinkEventInfo(dependency_builder, &event_info); + + // Construct events + std::map> instr2event; + for (auto& context_item : event_info) { + for (auto& waiter_item : context_item.second) { + size_t waiter_instr_id = waiter_item.first; + std::set& recorder_instr_ids = waiter_item.second; + + if (waiter_instr_id >= instructions->size()) { + waiter_instr_id -= instructions->size(); + } + + for (size_t recorder_instr_id : recorder_instr_ids) { + // Redundant record + if (recorder_instr_id >= instructions->size()) { + continue; + } + + paddle::framework::InstructionBase* recorder_instr = + instructions->at(recorder_instr_id); + paddle::framework::InstructionBase* waiter_instr = + instructions->at(waiter_instr_id); + platform::DeviceType waiter_type = GetWaiterType(waiter_instr); + + if (instr2event.find(recorder_instr_id) == instr2event.end()) { + std::shared_ptr device_event = + std::make_shared( + recorder_instr->DeviceContext().GetPlace(), + platform::GenerateDeviceEventFlag()); + recorder_instr->AddEventToRecord(device_event, + platform::kCUDA /*unused*/); + instr2event.emplace(recorder_instr_id, device_event); + } + + waiter_instr->AddEventToWait( + recorder_instr_id, instr2event.at(recorder_instr_id), waiter_type); + VLOG(6) << "Add event: " << recorder_instr->Name() << "(" + << recorder_instr_id << ") -> " << waiter_instr->Name() << "(" + << waiter_instr_id << "), waiter type = " << waiter_type; + } + } + } +} + +platform::DeviceType IrStreamAnalyzer::GetWaiterType( + const paddle::framework::InstructionBase* instr) const { + if (instr->KernelType() == OpFuncType::kCpuSync) { + return platform::kCPU; + } else { + if (platform::is_xpu_place(place_)) { + return platform::kXPU; + } else if (platform::is_custom_place(place_)) { + return platform::kCUSTOM_DEVICE; + } + return platform::kCUDA; + } +} + +void 
IrStreamAnalyzer::AnalyseAllRunType(
+    const std::vector<paddle::framework::InstructionBase*>& instructions,
+    const std::map<size_t, std::set<size_t>>& downstream_map,
+    std::vector<std::vector<std::vector<size_t>>>* run_type_info) const {
+  for (auto& item : downstream_map) {
+    size_t cur_instr_id = item.first;
+    const paddle::framework::InstructionBase* cur_instr =
+        instructions[item.first];
+    for (size_t next_instr_id : item.second) {
+      const paddle::framework::InstructionBase* next_instr =
+          instructions[next_instr_id];
+      DownstreamRunType run_type =
+          AnalyseRunTypeForTwoInstructions(cur_instr, next_instr);
+
+      (*run_type_info)[cur_instr_id][run_type].push_back(next_instr_id);
+
+      VLOG(6) << RunTypeToString(run_type) << ": " << cur_instr->Name() << "("
+              << cur_instr_id << ") -> " << next_instr->Name() << "("
+              << next_instr_id << ")";
+    }
+  }
+}
+
+DownstreamRunType IrStreamAnalyzer::AnalyseRunTypeForTwoInstructions(
+    const paddle::framework::InstructionBase* cur_instr,
+    const paddle::framework::InstructionBase* next_instr) const {
+  // xpu & ipu memcpy kernels are synchronous.
+  if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) {
+    return DownstreamRunType::kDirectRun;
+  }
+
+  // npu d2h kernel is asynchronous.
+  if (platform::is_custom_place(place_)) {
+    if (interpreter::IsCpuOp(cur_instr) ||
+        interpreter::IsMemcpyH2D(next_instr)) {
+      return DownstreamRunType::kDirectRun;
+    }
+  }
+
+  if (cur_instr->KernelType() == OpFuncType::kGpuAsync &&
+      (&cur_instr->DeviceContext() != &next_instr->DeviceContext())) {
+    return DownstreamRunType::kEventRun;
+  }
+
+  return DownstreamRunType::kDirectRun;
+}
+
+void IrStreamAnalyzer::AnalyseAllEventInfo(
+    const std::vector<paddle::framework::InstructionBase*>& instructions,
+    const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
+    std::map<const DeviceContext*, std::map<size_t, std::set<size_t>>>*
+        event_info) const {
+  for (size_t cur_instr_id = 0; cur_instr_id < instructions.size();
+       ++cur_instr_id) {
+    const std::vector<size_t>& next_instr_ids =
+        run_type_info[cur_instr_id][DownstreamRunType::kEventRun];
+    std::set<size_t> waiter_instr_ids;
+    std::set<size_t> visited_next_instr_id;
+
+    for (size_t next_instr_id : next_instr_ids) {
+      AnalyseEventInfoForTwoInstructions(instructions,
+                                         run_type_info,
+                                         cur_instr_id,
+                                         next_instr_id,
+                                         &waiter_instr_ids,
+                                         &visited_next_instr_id);
+    }
+
+    for (size_t waiter_instr_id : waiter_instr_ids) {
+      (*event_info)[&(instructions[cur_instr_id]->DeviceContext())]
+                   [waiter_instr_id]
+          .insert(cur_instr_id);
+    }
+  }
+}
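The run-type analysis reduces to one test per dependency edge: an event is needed only when the upstream instruction runs asynchronously and the two instructions are bound to different device contexts; everything else can be direct-run. The self-contained sketch below illustrates that predicate only; the types are simplified stand-ins, not the real Paddle classes:

// Illustrative sketch of the cross-stream test. Pointer identity of the
// DeviceContext doubles as stream identity, as in the patch.
#include <cstdio>

enum RunType { kDirectRun, kEventRun };

struct Instr {
  bool gpu_async;
  const void* dev_ctx;  // identity of the device context
};

RunType Analyse(const Instr& cur, const Instr& next) {
  if (cur.gpu_async && cur.dev_ctx != next.dev_ctx) return kEventRun;
  return kDirectRun;
}

int main() {
  int ctx_a = 0, ctx_b = 0;  // two distinct contexts, identified by address
  Instr gemm{true, &ctx_a}, copy{true, &ctx_b}, relu{true, &ctx_a};
  std::printf("gemm->copy: %s\n",
              Analyse(gemm, copy) == kEventRun ? "event" : "direct");
  std::printf("gemm->relu: %s\n",
              Analyse(gemm, relu) == kEventRun ? "event" : "direct");
  return 0;
}

Comparing DeviceContext addresses works because the interpreter hands out one context object per stream, so two instructions on the same stream share the same pointer and never need an event between them.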
+// The caller should guarantee that cur_instr and next_instr are kEventRun.
+void IrStreamAnalyzer::AnalyseEventInfoForTwoInstructions(
+    const std::vector<paddle::framework::InstructionBase*>& instructions,
+    const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
+    const size_t cur_instr_id,
+    const size_t next_instr_id,
+    std::set<size_t>* waiter_instr_ids,
+    std::set<size_t>* visited_next_instr_id) const {
+  if (visited_next_instr_id->find(next_instr_id) !=
+      visited_next_instr_id->end()) {
+    return;
+  }
+  visited_next_instr_id->insert(next_instr_id);
+
+  // NOTE(Ruibiao): Though depend_op as next_instr is no_need_buffer, we should
+  // still wait on its event, because depend_op is used to build dependencies
+  // for fused vars in some scenarios. In those cases, we do not know which
+  // vars may lead to an implicit data dependency. For example,
+  // ###
+  // ### fused_var = fuse_op(var0, ...)
+  // ### var1 = op1(fused_var)
+  // ### var0 = depend_op(var0, fused_var)
+  // ### var2 = op2(var0)
+  // ###
+  // If op1 is cross-stream with depend_op and op2, then we have:
+  // ###
+  // ### event_run : op1 -> depend_op
+  // ### direct_run : depend_op -> op2
+  // ###
+  // There is actually a data dependency between op1 and op2, since var0 and
+  // fused_var share the same tensor. However, as the dependency is implicit,
+  // we can only add an event for it with the help of depend_op.
+  if (HasDataDependency(instructions[cur_instr_id],
+                        instructions[next_instr_id]) ||
+      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
+      instructions[next_instr_id]->Name() == "depend") {
+    waiter_instr_ids->insert(next_instr_id);
+    return;
+  }
+
+  // NOTE(Ruibiao): If there is no data dependency from cur_instr to
+  // next_instr, and next_instr has no event_run downstream instr, we try to
+  // recursively add events between cur_instr and next_instr's
+  // direct-run-instrs. This can delay the event wait and achieve better
+  // scheduling performance in some scenarios. However, when next_instr has too
+  // many direct-run-instrs, it may perform worse than adding an event directly
+  // between cur_instr and next_instr.
+  for (size_t instr_id :
+       run_type_info[next_instr_id][DownstreamRunType::kDirectRun]) {
+    AnalyseEventInfoForTwoInstructions(instructions,
+                                       run_type_info,
+                                       cur_instr_id,
+                                       instr_id,
+                                       waiter_instr_ids,
+                                       visited_next_instr_id);
+  }
+}
+
+bool IrStreamAnalyzer::HasDataDependency(
+    const paddle::framework::InstructionBase* cur_instr,
+    const paddle::framework::InstructionBase* next_instr) const {
+  auto no_need_buffer_ins = [](const paddle::framework::InstructionBase* instr)
+      -> const std::unordered_set<::ir::Value> {
+    // auto* op = instr.OpBase();
+    // auto& inferer = op->Info().NoNeedBufferVarsInferer();
+    // if (inferer) {
+    //   return inferer(op->Inputs(), op->Outputs(), op->Attrs());
+    // }
+    // TODO(zhangbo9674): Get NoNeedBufferValue set
+    return std::unordered_set<::ir::Value>();
+  };
+
+  // cur_instr->var->next_instr
+  std::unordered_set<int> cur_var_ids;
+  for (auto& item : cur_instr->Outputs()) {
+    cur_var_ids.insert(item.second.begin(), item.second.end());
+  }
+
+  const std::unordered_set<::ir::Value> next_instr_no_need_buffer_ins =
+      no_need_buffer_ins(next_instr);
+
+  for (auto& item : next_instr->Inputs()) {
+    if (next_instr_no_need_buffer_ins.find(item.first) !=
+        next_instr_no_need_buffer_ins.end()) {
+      continue;
+    }
+    for (auto next_var_id : item.second) {
+      if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) {
+        VLOG(6) << "Found data dependency from " << cur_instr->Name() << "("
+                << cur_instr->Id() << ") to " << next_instr->Name() << "("
+                << next_instr->Id() << ") at variable " << item.first.impl()
+                << "(" << next_var_id << ")";
+        return true;
+      }
+    }
+  }
+
+  // cur_instr->var && next_instr->var
+  // var->cur_instr && next_instr->var
+  const std::unordered_set<::ir::Value> cur_instr_no_need_buffer_ins =
+      no_need_buffer_ins(cur_instr);
+  for (auto& item : cur_instr->Inputs()) {
+    if (cur_instr_no_need_buffer_ins.find(item.first) ==
+        cur_instr_no_need_buffer_ins.end()) {
+      cur_var_ids.insert(item.second.begin(), item.second.end());
+    }
+  }
+
+  for (auto& item : next_instr->Outputs()) {
+    for (auto next_var_id : item.second) {
+      if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) {
+        VLOG(6) << "Found data dependency from " << cur_instr->Name() << "("
+                << cur_instr->Id() << ") to " << next_instr->Name() << "("
+                << next_instr->Id() << ") at variable " << item.first.impl()
+                << "(" << next_var_id << ")";
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
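At its core, HasDataDependency is a set-overlap test between the upstream instruction's output variable ids and the downstream instruction's input ids (plus the symmetric write-after-read and write-after-write cases handled above). A reduced sketch of the central probe, with hypothetical variable ids:

// Illustrative sketch of the overlap test: collect the upstream op's output
// ids and probe the downstream op's input ids against them.
#include <cstdio>
#include <unordered_set>
#include <vector>

bool HasDataDependency(const std::vector<int>& cur_outputs,
                       const std::vector<int>& next_inputs) {
  std::unordered_set<int> cur_var_ids(cur_outputs.begin(), cur_outputs.end());
  for (int id : next_inputs) {
    if (cur_var_ids.count(id)) return true;  // shared variable => dependency
  }
  return false;
}

int main() {
  // Op A writes vars {3, 7}; op B reads {7, 9} -> they share var 7.
  std::printf("%d\n", HasDataDependency({3, 7}, {7, 9}));  // prints 1
  std::printf("%d\n", HasDataDependency({3, 7}, {4, 9}));  // prints 0
  return 0;
}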
+void IrStreamAnalyzer::ShrinkEventInfo(
+    const IrDependencyBuilder& dependency_builder,
+    std::map<const DeviceContext*, std::map<size_t, std::set<size_t>>>*
+        event_info) const {
+  for (auto& item : *event_info) {
+    // Shrink redundant recorders: waiter instrs should only wait for the last
+    // recorder instrs in each stream.
+    std::map<size_t, std::set<size_t>>& waiter_recorder_map = item.second;
+    for (auto& waiter_recorder : waiter_recorder_map) {
+      size_t waiter_instr_id = waiter_recorder.first;
+      std::set<size_t>& recorder_instr_ids = waiter_recorder.second;
+      std::set<size_t> unnecessary_recorder_instr_ids;
+      for (size_t cur_instr_id : recorder_instr_ids) {
+        for (size_t next_instr_id : recorder_instr_ids) {
+          if (dependency_builder.OpHappensBefore(cur_instr_id,
+                                                 next_instr_id)) {
+            unnecessary_recorder_instr_ids.insert(cur_instr_id);
+            break;
+          }
+        }
+      }
+
+      for (size_t unnecessary_recorder_instr_id :
+           unnecessary_recorder_instr_ids) {
+        VLOG(8) << "Shrink event : " << unnecessary_recorder_instr_id << " -> "
+                << waiter_instr_id;
+        recorder_instr_ids.erase(unnecessary_recorder_instr_id);
+      }
+    }
+
+    // Shrink redundant waiters: recorder instrs should only be waited on by
+    // the first waiter instrs in each stream.
+    std::map<size_t, std::set<size_t>> recorder_waiter_map;
+    for (auto& waiter_recorder : waiter_recorder_map) {
+      size_t waiter_instr_id = waiter_recorder.first;
+      std::set<size_t>& recorder_instr_ids = waiter_recorder.second;
+      for (size_t record_instr_id : recorder_instr_ids) {
+        recorder_waiter_map[record_instr_id].insert(waiter_instr_id);
+      }
+    }
+
+    for (auto& recorder_waiter : recorder_waiter_map) {
+      size_t recorder_instr_id = recorder_waiter.first;
+      std::set<size_t>& waiter_instr_ids = recorder_waiter.second;
+      std::set<size_t> unnecessary_waiter_instr_ids;
+      for (size_t cur_instr_id : waiter_instr_ids) {
+        for (size_t next_instr_id : waiter_instr_ids) {
+          if (dependency_builder.OpHappensBefore(cur_instr_id,
+                                                 next_instr_id)) {
+            unnecessary_waiter_instr_ids.insert(next_instr_id);
+            break;
+          }
+        }
+      }
+
+      for (size_t unnecessary_waiter_instr_id : unnecessary_waiter_instr_ids) {
+        VLOG(8) << "Shrink event : " << recorder_instr_id << " -> "
+                << unnecessary_waiter_instr_id;
+        waiter_recorder_map[unnecessary_waiter_instr_id].erase(
+            recorder_instr_id);
+      }
+    }
+  }
+}
+
 }  // namespace interpreter
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
index c548f0cc819511..5ea0fbee9687ac 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
@@ -90,41 +90,45 @@ class IrStreamAnalyzer {

   ~IrStreamAnalyzer() {}

-  void ConstructEvents(std::vector<Instruction>* instructions) const;
+  void ConstructEvents(
+      std::vector<paddle::framework::InstructionBase*>* instructions) const;

-  platform::DeviceType GetWaiterType(const Instruction& instr) const;
+  platform::DeviceType GetWaiterType(
+      const paddle::framework::InstructionBase* instr) const;

  private:
-  bool HasDataDependency(const Instruction& cur_instr,
-                         const Instruction& next_instr) const;
+  void AnalyseAllRunType(
+      const std::vector<paddle::framework::InstructionBase*>& instructions,
+      const std::map<size_t, std::set<size_t>>& downstream_map,
+      std::vector<std::vector<std::vector<size_t>>>* run_type_info) const;
+
+  DownstreamRunType AnalyseRunTypeForTwoInstructions(
+      const paddle::framework::InstructionBase* cur_instr,
+      const paddle::framework::InstructionBase* next_instr) const;

   void AnalyseAllEventInfo(
-      const std::vector<Instruction>& instructions,
+      const std::vector<paddle::framework::InstructionBase*>& instructions,
       const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
       std::map<const DeviceContext*, std::map<size_t, std::set<size_t>>>*
           event_info) const;

-  void AnalyseAllRunType(
-      const std::vector<Instruction>& instructions,
-      const std::map<size_t, std::set<size_t>>& downstream_map,
-      std::vector<std::vector<std::vector<size_t>>>* run_type_info) const;

   void AnalyseEventInfoForTwoInstructions(
-      const std::vector<Instruction>& instructions,
+      const std::vector<paddle::framework::InstructionBase*>& instructions,
       const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
       const size_t cur_instr_id,
       const size_t next_instr_id,
       std::set<size_t>* waiter_instr_ids,
std::set* visited_next_instr_id) const; + bool HasDataDependency( + const paddle::framework::InstructionBase* cur_instr, + const paddle::framework::InstructionBase* next_instr) const; + void ShrinkEventInfo( - const DependencyBuilder& dependency_builder, + const IrDependencyBuilder& dependency_builder, std::map>>* event_info_map) const; - DownstreamRunType AnalyseRunTypeForTwoInstructions( - const Instruction& cur_instr, const Instruction& next_instr) const; - const Place place_; }; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 52ca442576a1ad..84033c3a1f15db 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -202,9 +202,11 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, VLOG(4) << DebugValueInfo(); // NOTE(zhangbo): Iterative version, gradually replacing BuildOpFuncList() - // and Convert() - // BuildInstruction(); - // BuildInstructionDependences(); + // and Convert() by: + // [1] BuildInstruction(); + // [2] BuildInstructionDependences(); + // [3] ir_stream_analyzer_.ConstructEvents(&vec_instruction_base_); + // [4] GC(); std::vector op_func_nodes; interpreter::BuildOpFuncList(place_, @@ -267,6 +269,8 @@ FetchList NewIRInterpreter::BetaRun(const std::vector& feed_names, VLOG(4) << DebugValueInfo(); BuildInstruction(); BuildInstructionDependences(); + ir_stream_analyzer_.ConstructEvents(&vec_instruction_base_); + for (size_t instr_id = 0; instr_id < vec_instruction_base_.size(); ++instr_id) { vec_instruction_base_[instr_id]->Run(); @@ -1545,13 +1549,13 @@ void NewIRInterpreter::AnalyseExecuteOrderForTrace() { /// ======================== /// void NewIRInterpreter::BuildInstruction() { - VLOG(0) << "Build Instructions for new ir ... "; + VLOG(6) << "Build Instructions for new ir ... 
"; vec_instruction_base_.clear(); size_t op_idx = 0; for (auto it = ir_program_->block()->begin(); it != ir_program_->block()->end(); ++it) { - VLOG(0) << "Build Instruction for op: " << op_idx; + VLOG(6) << "Build Instruction for op: " << op_idx; if ((*it)->dialect()->name() == "pd_kernel") { auto op_name = (*it) ->attributes() diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 5ab06635f7a8d7..34039f3662d527 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -194,7 +194,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::unique_ptr<::ir::Program> ir_program_{nullptr}; - // std::vector> vec_instruction_base_; std::vector vec_instruction_base_; std::unordered_map<::ir::Value, std::string> value_2_var_name_; From 0c6899799ad6e25b31f0ed898a49ed6345fc295b Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Tue, 25 Jul 2023 05:11:22 +0000 Subject: [PATCH 11/18] add code --- .../instruction/instruction_base.h | 7 ++++ .../instruction/phi_kernel_instruction.cc | 8 ++++ .../interpreter/stream_analyzer.cc | 10 +---- .../new_executor/new_ir_interpreter.cc | 40 +++++++++++++++++++ .../new_executor/new_ir_interpreter.h | 2 + .../fluid/ir/interface/op_yaml_info_parser.cc | 13 ++++-- .../fluid/ir/interface/op_yaml_info_parser.h | 5 +++ 7 files changed, 72 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 686503e179add2..e046d65d1de54e 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -125,6 +125,13 @@ class InstructionBase { void SetOutputs( const std::unordered_map<::ir::Value, std::vector>& outputs); + const std::unordered_set<::ir::Value>& NoNeedBuffer() const { + return no_need_buffer_; + } + void SetNoNeedBuffer(const std::unordered_set<::ir::Value>& no_need_buffer) { + no_need_buffer_ = no_need_buffer; + } + virtual void Run() = 0; virtual const std::string& Name() const = 0; diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 0aa08fa9f6ad54..35370bad006443 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -294,6 +294,14 @@ PhiKernelInstruction::PhiKernelInstruction( InitInputsOutputsIds( op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); VLOG(6) << "finish process inputs outputs index"; + + auto no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); + std::unordered_set<::ir::Value> no_need_buffer_values; + for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { + no_need_buffer_values.insert(op->operand(id)); + } + SetNoNeedBuffer(no_need_buffer_values); + VLOG(6) << "finish process no need buffer"; } std::vector GetValueIds( diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 8593ea12af98f2..ef2e6af636bdff 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -708,15 +708,7 @@ bool IrStreamAnalyzer::HasDataDependency( 
const paddle::framework::InstructionBase* cur_instr, const paddle::framework::InstructionBase* next_instr) const { auto no_need_buffer_ins = [](const paddle::framework::InstructionBase* instr) - -> const std::unordered_set { - // auto* op = instr.OpBase(); - // auto& inferer = op->Info().NoNeedBufferVarsInferer(); - // if (inferer) { - // return inferer(op->Inputs(), op->Outputs(), op->Attrs()); - // } - // TODO(zhangbo9674): Get NoNeedBufferValue set - return std::unordered_set(); - }; + -> const std::unordered_set { return instr->NoNeedBuffer(); }; // cur_instr->var->next_instr std::unordered_set cur_var_ids; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 78eeaf4b5929a1..979914c40a576e 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -267,9 +267,34 @@ FetchList NewIRInterpreter::BetaRun(const std::vector& feed_names, &var_name_2_id_, &variable_list_); VLOG(4) << DebugValueInfo(); + BuildInstruction(); + BuildInstructionDependences(); + ir_stream_analyzer_.ConstructEvents(&vec_instruction_base_); + // add events for the input vars of the jit program, since there are async + // copies from gpu_pinned place to gpu place on the compute stream. + for (size_t i = 0; i < dependecy_count_.size(); ++i) { + if (dependecy_count_[i] == 0) { + InstructionBase* inst = vec_instruction_base_[i]; + if (inst->Name() == "pd.memcpy_d2h" && platform::is_gpu_place(place_)) { + for (auto& item : inst->Inputs()) { + for (auto var_id : item.second) { + auto name = GetNameById(var_id); + if (JitInputVars().count(name)) { + auto device_event = std::make_shared( + place_, platform::GenerateDeviceEventFlag()); + VLOG(4) << "Add input event for input: " << name << " of " + << inst->Name(); + inst->AddEventToWait( + i, device_event, ir_stream_analyzer_.GetWaiterType(inst)); + } + } + } + } + } + } for (size_t instr_id = 0; instr_id < vec_instruction_base_.size(); ++instr_id) { @@ -354,6 +379,21 @@ void NewIRInterpreter::reset_scope(Scope* new_scope) { const Scope* NewIRInterpreter::local_scope() const { return local_scope_; } +std::string NewIRInterpreter::GetNameById(int id) const { + // NOTE(zhiqiu): do not use vec_meta_info_[id].vardesc_->Name() since + // vec_meta_info_[id] may be nullptr, + // typically when the target variable does not exist in the original program + // desc, but is created by the interpreter core. + // For example, created and used by d2h_copy or h2d_copy operator. 
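+  // A hypothetical illustration (the names and ids below are made up, not
+  // taken from any real program): if var_name_2_id_ holds {"x", 7} and
+  // {"fetch_var", 8}, then GetNameById(7) returns "x", while GetNameById(42)
+  // finds no match and returns "".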
+ auto it = std::find_if(var_name_2_id_.begin(), + var_name_2_id_.end(), + [id](const auto& pair) { return pair.second == id; }); + if (it != var_name_2_id_.end()) { + return it->first; + } + return ""; +} + void NewIRInterpreter::ShareWorkQueueFrom(InterpreterBaseImpl* src) { async_work_queue_ = reinterpret_cast(src)->GetWorkQueue(); VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 5c938d8b7f2ccc..203f959a43d161 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -80,6 +80,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { hookfuncs_ = hookfuncs; } + std::string GetNameById(int id) const; + private: // build graph void Convert(std::vector* op_func_nodes); diff --git a/paddle/fluid/ir/interface/op_yaml_info_parser.cc b/paddle/fluid/ir/interface/op_yaml_info_parser.cc index bf2c8329f17cb2..68ed2fff7030fb 100644 --- a/paddle/fluid/ir/interface/op_yaml_info_parser.cc +++ b/paddle/fluid/ir/interface/op_yaml_info_parser.cc @@ -92,6 +92,10 @@ const std::map& OpYamlInfoParser::OutputName2Id() const { return input_name2id_; } +const std::vector& OpYamlInfoParser::NoNeedBufferIds() const { + return no_need_buffer_ids_; +} + bool OpYamlInfoParser::HasInplace(const std::string& out_name) const { auto& inplace_info = std::get<3>(op_info_tuple_).inplace; for (size_t i = 0; i < inplace_info.size(); i++) { @@ -117,14 +121,16 @@ const std::string& OpYamlInfoParser::InplaceName( void OpYamlInfoParser::parse() { auto input_info = std::get<0>(op_info_tuple_); - int input_start_index = 0; for (size_t i = 0; i < input_info.size(); ++i) { - input_name2id_[input_info[i].name] = input_start_index++; + input_name2id_[input_info[i].name] = i; input_name_list_.push_back(input_info[i].name); input_info_[input_info[i].name] = input_info[i]; if (!input_info[i].is_mutable_attribute) { input_tensor_number_++; } + if (input_info[i].no_need_buffer) { + no_need_buffer_ids_.push_back(i); + } } auto attribute_info = std::get<1>(op_info_tuple_); @@ -133,10 +139,9 @@ void OpYamlInfoParser::parse() { attr_info_[attribute_info[i].name] = attribute_info[i]; } - int output_start_index = 0; auto output_info = std::get<2>(op_info_tuple_); for (size_t i = 0; i < output_info.size(); ++i) { - output_name2id_[output_info[i].name] = output_start_index++; + output_name2id_[output_info[i].name] = i; output_name_list_.push_back(output_info[i].name); output_info_[output_info[i].name] = output_info[i]; } diff --git a/paddle/fluid/ir/interface/op_yaml_info_parser.h b/paddle/fluid/ir/interface/op_yaml_info_parser.h index 6b600a6d70e812..356decadcf677f 100644 --- a/paddle/fluid/ir/interface/op_yaml_info_parser.h +++ b/paddle/fluid/ir/interface/op_yaml_info_parser.h @@ -37,6 +37,8 @@ class OpYamlInfoParser { const std::map& InputName2Id() const; const std::map& OutputName2Id() const; + const std::vector& NoNeedBufferIds() const; + const std::vector& InputNames() const { return input_name_list_; } @@ -65,6 +67,9 @@ class OpYamlInfoParser { std::map input_info_; int input_tensor_number_{0}; + // no_need_buffer_ids + std::vector no_need_buffer_ids_; + // attribute info std::vector attribute_name_list_; std::map attr_info_; From 9ec29de98065aa79ca78ae34a2126d3a2675dc94 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Tue, 25 Jul 2023 06:28:33 +0000 Subject: [PATCH 12/18] fix compile bug --- 
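For context, the NoNeedBufferIds() interface added in the previous commit is consumed in PhiKernelInstruction roughly as sketched below. This is a sketch, not the literal patch; `op` and `yaml_info_parser` are the constructor locals shown above. The parsed ids are operand indices and must index operand() directly — indexing with the loop counter instead, as the hunk above does, is the bug corrected in the "fix bug" commit that follows:

    // Sketch: collect the ir::Values whose buffers the kernel never reads.
    std::unordered_set<::ir::Value> no_need_buffer_values;
    for (size_t idx : yaml_info_parser.NoNeedBufferIds()) {
      no_need_buffer_values.insert(op->operand(idx));  // idx is an operand index
    }
    SetNoNeedBuffer(no_need_buffer_values);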
.../framework/new_executor/instruction/phi_kernel_instruction.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 35370bad006443..8ee66bfa1cea28 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/ir/interface/op_yaml_info.h" #include "paddle/fluid/ir/interface/op_yaml_info_parser.h" #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" From 6875620f0a88a107956749f8bfa4a3f1a3e85a55 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 27 Jul 2023 01:54:36 +0000 Subject: [PATCH 13/18] fix bug --- .../new_executor/instruction/instruction_base.h | 9 +++++---- .../new_executor/instruction/phi_kernel_instruction.cc | 4 ++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index e046d65d1de54e..a893ec67d9fe97 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -126,10 +126,11 @@ class InstructionBase { const std::unordered_map<::ir::Value, std::vector>& outputs); const std::unordered_set<::ir::Value>& NoNeedBuffer() const { - return no_need_buffer_; + return no_need_buffer_values_; } - void SetNoNeedBuffer(const std::unordered_set<::ir::Value>& no_need_buffer) { - no_need_buffer_ = no_need_buffer; + void SetNoNeedBuffer( + const std::unordered_set<::ir::Value>& no_need_buffer_values) { + no_need_buffer_values_ = no_need_buffer_values; } virtual void Run() = 0; @@ -171,7 +172,7 @@ class InstructionBase { std::unordered_map<::ir::Value, std::vector> output_index_; - std::unordered_set<::ir::Value> no_need_buffer_; + std::unordered_set<::ir::Value> no_need_buffer_values_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 8ee66bfa1cea28..f24e927a5909a8 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -296,10 +296,10 @@ PhiKernelInstruction::PhiKernelInstruction( op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); VLOG(6) << "finish process inputs outputs index"; - auto no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); + auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); std::unordered_set<::ir::Value> no_need_buffer_values; for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { - no_need_buffer_values.insert(op->operand(id)); + no_need_buffer_values.insert(op->operand(no_need_buffer_ids[id])); } SetNoNeedBuffer(no_need_buffer_values); VLOG(6) << "finish process no need buffer"; From 004b61e06ca92cda5d5de70757d66f762622dfe1 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 27 Jul 2023 02:34:40 +0000 Subject: [PATCH 14/18] refine code --- .../instruction/phi_kernel_instruction.cc | 44 ++----------------- .../interpreter/stream_analyzer.cc | 
36 --------------- .../interpreter/stream_analyzer.h | 37 ++++++++++++++++ 3 files changed, 41 insertions(+), 76 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index f24e927a5909a8..39e791aca3f8ac 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/ir/dialect/pd_dialect.h" #include "paddle/fluid/ir/interface/infermeta.h" @@ -34,44 +35,6 @@ namespace paddle { namespace framework { -using DeviceContext = paddle::platform::DeviceContext; - -class IrContextManager { - public: - using DeviceContextMap = - std::map>>; - - static IrContextManager& Instance() { - static IrContextManager* ctx_manager = new IrContextManager; - return *ctx_manager; - } - - std::shared_future> Get( - const std::string& type, - const platform::Place& place, - int stream_priority) { - std::lock_guard lk(ctx_mtx_); - VLOG(6) << "Get dev_ctx for " << type << " - " << place; - - DeviceContextMap& ctxs = ctx_pool_[type]; - if (ctxs.find(place) == ctxs.end()) { - platform::EmplaceDeviceContexts( - &ctxs, - {place}, - /*disable_setting_default_stream_for_allocator=*/true, - stream_priority); - } - return ctxs[place]; - } - - private: - IrContextManager() {} - DISABLE_COPY_AND_ASSIGN(IrContextManager); - - std::mutex ctx_mtx_; - std::unordered_map ctx_pool_; -}; - platform::DeviceContext* ParseDeviceContext( ir::Operation* op, platform::DeviceContext* origin_dev_ctx, @@ -81,9 +44,10 @@ platform::DeviceContext* ParseDeviceContext( auto op_attributes = op->attributes(); auto op_name = op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - IrContextManager& ctx_manager = IrContextManager::Instance(); + interpreter::ContextManager& ctx_manager = + interpreter::ContextManager::Instance(); - DeviceContext* dev_ctx = nullptr; + platform::DeviceContext* dev_ctx = nullptr; // only gpu needs update; xpu does not, because the xpu memcpy op kernel is // synchronous. 
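The interpreter::ContextManager that replaces IrContextManager here is the singleton moved into stream_analyzer.h in the hunks below. A minimal usage sketch — the kernel-type string and stream priority are illustrative values, and Get() is assumed to return a std::shared_future holding a std::unique_ptr to the DeviceContext, as in the class definition:

    // Fetch (or lazily create) a dedicated device context for one kernel type
    // on a place; the shared_future lets concurrent callers share a single
    // construction instead of racing to build their own.
    auto& ctx_manager = interpreter::ContextManager::Instance();
    platform::DeviceContext* dev_ctx =
        ctx_manager.Get(/*type=*/"memcpy_d2h", place, /*stream_priority=*/0)
            .get()   // wait on the shared_future
            .get();  // raw pointer out of the unique_ptr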
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 620aa073bb3787..9a28708730a48c 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -29,42 +29,6 @@ namespace interpreter { using DeviceContext = platform::DeviceContext; using DeviceEvent = platform::DeviceEvent; -class ContextManager { - public: - using DeviceContextMap = - std::map>>; - - static ContextManager& Instance() { - static ContextManager* ctx_manager = new ContextManager; - return *ctx_manager; - } - - std::shared_future> Get( - const std::string& type, - const platform::Place& place, - int stream_priority) { - std::lock_guard lk(ctx_mtx_); - VLOG(6) << "Get dev_ctx for " << type << " - " << place; - - DeviceContextMap& ctxs = ctx_pool_[type]; - if (ctxs.find(place) == ctxs.end()) { - platform::EmplaceDeviceContexts( - &ctxs, - {place}, - /*disable_setting_default_stream_for_allocator=*/true, - stream_priority); - } - return ctxs[place]; - } - - private: - ContextManager() {} - DISABLE_COPY_AND_ASSIGN(ContextManager); - - std::mutex ctx_mtx_; - std::unordered_map ctx_pool_; -}; - inline std::string RunTypeToString(DownstreamRunType run_type) { if (run_type == DownstreamRunType::kDirectRun) { return "DirectRun"; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h index 1cb181bfeb0833..1e738da044d226 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h @@ -28,6 +28,43 @@ namespace interpreter { enum DownstreamRunType { kDirectRun, kEventRun }; +class ContextManager { + public: + using DeviceContextMap = + std::map>>; + + static ContextManager& Instance() { + static ContextManager* ctx_manager = new ContextManager; + return *ctx_manager; + } + + std::shared_future> Get( + const std::string& type, + const platform::Place& place, + int stream_priority) { + std::lock_guard lk(ctx_mtx_); + VLOG(6) << "Get dev_ctx for " << type << " - " << place; + + DeviceContextMap& ctxs = ctx_pool_[type]; + if (ctxs.find(place) == ctxs.end()) { + platform::EmplaceDeviceContexts( + &ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true, + stream_priority); + } + return ctxs[place]; + } + + private: + ContextManager() {} + DISABLE_COPY_AND_ASSIGN(ContextManager); + + std::mutex ctx_mtx_; + std::unordered_map ctx_pool_; +}; + class StreamAnalyzer { public: using DeviceContext = platform::DeviceContext; From c73d2d95bce806432a544ca6a1c8d057f4d2dde3 Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 27 Jul 2023 12:27:23 +0000 Subject: [PATCH 15/18] refine code --- .../interpreter/dependency_builder.cc | 199 ++---- .../interpreter/dependency_builder.h | 62 +- .../interpreter/interpreter_util.cc | 16 +- .../interpreter/interpreter_util.h | 8 +- .../interpreter/stream_analyzer.cc | 589 +++++++----------- .../interpreter/stream_analyzer.h | 58 +- .../new_executor/new_ir_interpreter.cc | 25 +- .../new_executor/new_ir_interpreter.h | 6 +- 8 files changed, 323 insertions(+), 640 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index 0b878fcc9a5338..fcdb95d86923b7 100644 --- 
a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -405,22 +405,6 @@ void DependencyBuilder::BuildDownstreamMap() { op2dependences[op_idx] = std::set(); } - auto update_var_min_rw_op = - [](const std::map>& op2dependences, - std::map>* var2min_rw_op, - size_t cur_op, - size_t rw_var) { - // rw_var is inputs or outputs of cur_op - // this function update the var2min_rw_op set . - if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { - (*var2min_rw_op)[rw_var] = std::list(); - } - for (auto dep_op : op2dependences.at(cur_op)) { - var2min_rw_op->at(rw_var).remove(dep_op); - } - var2min_rw_op->at(rw_var).push_back(cur_op); - }; - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { remove_duplicate.clear(); // step1: update the op2dependences structure @@ -485,7 +469,7 @@ void DependencyBuilder::BuildDownstreamMap() { for (auto var : item.second) { if (remove_duplicate.count(var) == 0) { // var in input list and in output list, so remove it. - update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); + UpdateVarMinRwOp(op2dependences, &var2min_rw_op, op_idx, var); } } } @@ -546,21 +530,46 @@ void DependencyBuilder::ShrinkDownstreamMap() { << StringizeDownstreamMap(*op_downstream_map_); } +void DependencyBuilder::UpdateVarMinRwOp( + const std::map>& op2dependences, + std::map>* var2min_rw_op, + size_t cur_op, + size_t rw_var) { + // rw_var is an input or output of cur_op + // this function updates the var2min_rw_op set. + if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { + (*var2min_rw_op)[rw_var] = std::list(); + } + for (auto dep_op : op2dependences.at(cur_op)) { + var2min_rw_op->at(rw_var).remove(dep_op); + } + var2min_rw_op->at(rw_var).push_back(cur_op); +} + /// ======================== /// /// For new ir /// /// ======================== /// -const std::map>& IrDependencyBuilder::Build( - const std::vector& instructions) { +NewIrDependencyBuilder::NewIrDependencyBuilder() : instructions_(nullptr) { + is_build_ = false; + op_downstream_map_ = std::make_shared>>(); + op_happens_before_ = std::make_shared>>(); +} + +const std::map>& NewIrDependencyBuilder::Build( + const std::vector>& + instructions) { if (is_build_) { - return op_downstream_map_; + return *op_downstream_map_; } + std::tie(op_downstream_map_, op_happens_before_) = GetDependency(); + instructions_ = &instructions; op_num_ = instructions_->size(); ops_before_.assign(op_num_, {}); ops_behind_.assign(op_num_, {}); - op_happens_before_.assign(op_num_, std::vector(op_num_, false)); + op_happens_before_->assign(op_num_, std::vector(op_num_, false)); BuildDownstreamMap(); VLOG(6) << "Finish BuildDownstreamMap"; @@ -575,16 +584,16 @@ const std::map>& IrDependencyBuilder::Build( // TODO(zhangbo): Add dependency for special op ? VLOG(6) << "Finish build dependency"; - VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); + VLOG(8) << "downstream count: " << CountDownstreamMap(*op_downstream_map_); VLOG(8) << "downstream_map: " << std::endl - << StringizeDownstreamMap(op_downstream_map_); + << StringizeDownstreamMap(*op_downstream_map_); is_build_ = true; - return op_downstream_map_; + return *op_downstream_map_; } -void IrDependencyBuilder::BuildDownstreamMap() { +void NewIrDependencyBuilder::BuildDownstreamMap() { auto var2min_rw_op = std::map>(); // # map from variable id to read // write op id. 
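  // A worked example of this bookkeeping with hypothetical op/var ids:
  //   op0 writes v : var2min_rw_op[v] = {op0}
  //   op1 reads  v : op1 depends on the last writer op0 (RAW);
  //                  UpdateVarMinRwOp drops op0 and appends op1 -> {op1}
  //   op2 reads  v : op2 also depends on op0; the list grows to {op1, op2}
  //   op3 writes v : op3 depends on every op in {op1, op2} (WAR), and the
  //                  write resets var2min_rw_op[v] to {op3}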
@@ -603,22 +612,6 @@ void IrDependencyBuilder::BuildDownstreamMap() { op2dependences[op_idx] = std::set(); } - auto update_var_min_rw_op = - [](const std::map>& op2dependences, - std::map>* var2min_rw_op, - size_t cur_op, - size_t rw_var) { - // rw_var is inputs or outputs of cur_op - // this function update the var2min_rw_op set . - if (var2min_rw_op->find(rw_var) == var2min_rw_op->end()) { - (*var2min_rw_op)[rw_var] = std::list(); - } - for (auto dep_op : op2dependences.at(cur_op)) { - var2min_rw_op->at(rw_var).remove(dep_op); - } - var2min_rw_op->at(rw_var).push_back(cur_op); - }; - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { remove_duplicate.clear(); // step1: update the op2dependences structure @@ -656,7 +649,7 @@ void IrDependencyBuilder::BuildDownstreamMap() { for (auto var : item.second) { if (remove_duplicate.count(var) == 0) { // var in input list and in output list, so remove it. - update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var); + UpdateVarMinRwOp(op2dependences, &var2min_rw_op, op_idx, var); } } } @@ -674,128 +667,6 @@ void IrDependencyBuilder::BuildDownstreamMap() { } } -void IrDependencyBuilder::AddDownstreamOp(size_t prior_op_idx, - size_t posterior_op_idx) { - PADDLE_ENFORCE_EQ( - OpHappensBefore(posterior_op_idx, prior_op_idx), - false, - phi::errors::Unavailable( - "Can not add dependency %d->%d because %d is run before %d", - prior_op_idx, - posterior_op_idx, - posterior_op_idx, - prior_op_idx)); - - std::set& downstream_ops = op_downstream_map_[prior_op_idx]; - // NOTE(Ruibiao): Here the downstream map shrinking is best-effort, therefore - // ShrinkDownstreamMap after BuildDownstreamMap is still helpful. For example, - // a->c will not be shrinked in the following case: AddDownstreamOp(a, b) -> - // AddDownstreamOp(a, c) -> AddDownstreamOp(b, c), it should be shrinked by - // ShrinkDownstreamMap. 
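  // Worked example of the best-effort shrinking described above (hypothetical
  // ids): after AddDownstreamOp(a, b); AddDownstreamOp(a, c);
  // AddDownstreamOp(b, c); the map holds a: {b, c}, b: {c}. Since
  // op_happens_before_ records b -> c, ShrinkDownstreamMap later reduces
  // a's downstream set to {b}.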
- for (size_t op_idx : downstream_ops) { - if (OpHappensBefore(op_idx, posterior_op_idx)) { - VLOG(7) << "Find dependencies " << prior_op_idx << "->" << op_idx << "->" - << posterior_op_idx << ", skip adding " << prior_op_idx << "->" - << posterior_op_idx; - return; - } - } - downstream_ops.insert(posterior_op_idx); - - std::vector prior_of_prior = ops_before_[prior_op_idx]; - std::vector posterior_of_posterior = ops_behind_[posterior_op_idx]; - - auto update_op_happen_before = [this](size_t prior_op_idx, - size_t posterior_op_idx) { - if (!op_happens_before_[prior_op_idx][posterior_op_idx]) { - op_happens_before_[prior_op_idx][posterior_op_idx] = true; - ops_before_[posterior_op_idx].push_back(prior_op_idx); - ops_behind_[prior_op_idx].push_back(posterior_op_idx); - } - }; - - update_op_happen_before(prior_op_idx, posterior_op_idx); - - // All ops before prior-op are also before posterior-op - for (size_t op_idx : prior_of_prior) { - update_op_happen_before(op_idx, posterior_op_idx); - } - - // All ops after posterior-op are also after prior-op - for (size_t op_idx : posterior_of_posterior) { - update_op_happen_before(prior_op_idx, op_idx); - } - - VLOG(8) << prior_op_idx << "->" << posterior_op_idx; - VLOG(8) << "Add dependency from " << instructions_->at(prior_op_idx)->Name() - << "(" << prior_op_idx << ") to " - << instructions_->at(posterior_op_idx)->Name() << "(" - << posterior_op_idx << ")"; -} - -void IrDependencyBuilder::ShrinkDownstreamMap() { - // remove unnecessary downstream ops - // for example, a->b->c - // a: b, c - // b: c - // => - // a: b - // b: c - - // shrink, find the downstream op that has no other op in the - // downstream list happens before it - for (size_t i = 0; i < op_num_; ++i) { - if (op_downstream_map_.find(i) == op_downstream_map_.end()) { - continue; - } - - std::set minumum_nexts; - for (size_t item : op_downstream_map_.at(i)) { - bool not_after_any = true; - // find the op that is not executed after any - for (size_t other_item : op_downstream_map_.at(i)) { - if (OpHappensBefore(other_item, item)) { - VLOG(8) << "happens_before: " << other_item << "->" << item - << ", so skip " << item; - not_after_any = false; - break; - } - } - if (not_after_any) { - VLOG(8) << "downstream op of " << i << ": " << item; - minumum_nexts.insert(item); - } - } - // NOTE(Ruibiao): op_happens_before will not be changed when shrink - // dowstream map - op_downstream_map_.at(i) = minumum_nexts; - } - VLOG(8) << "Finish shrink downstream map"; - VLOG(8) << "downstream count: " << CountDownstreamMap(op_downstream_map_); - VLOG(8) << "downstream_map: " << std::endl - << StringizeDownstreamMap(op_downstream_map_); -} - -void IrDependencyBuilder::AddDependencyForSequentialRun() { - size_t dependence_op_idx = ULLONG_MAX; - for (size_t op_idx = 0; op_idx < op_num_; ++op_idx) { - if (dependence_op_idx != ULLONG_MAX) { - AddDownstreamOp(dependence_op_idx, op_idx); - } - dependence_op_idx = op_idx; - } -} - -const std::map>& IrDependencyBuilder::OpDownstreamMap() - const { - PADDLE_ENFORCE_EQ( - is_build_, - true, - phi::errors::Unavailable( - "DependencyBuilder is not yet built, call Build() firstly.")); - return op_downstream_map_; -} - } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index 8f9669513a50c3..ee04bd75201b47 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ 
b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -57,7 +57,7 @@ class DependencyBuilder { void ShareDependencyFrom(const DependencyBuilder& src); - private: + protected: void AddDependencyForCoalesceTensorOp(); void AddDependencyForCommunicationOp(); void AddDependencyForRandomOp(); @@ -70,8 +70,14 @@ void ShrinkDownstreamMap(); + void UpdateVarMinRwOp( + const std::map>& op2dependences, + std::map>* var2min_rw_op, + size_t cur_op, + size_t rw_var); + bool is_build_; - const std::vector* instructions_; // not_own + size_t op_num_; // ops_behind_ is the adjacency list about op to its posterior-ops, that is to // say, op_behind_[i] == {a, b, c} means op[a], op[b] and op[c] depend on // op[i] directly or indirectly. ops_before_ is the reversed adjacency list of // ops_behind_. @@ -89,63 +95,29 @@ // op_happens_before_ is a matrix form of ops_before_ and ops_behind_, it is // used to speed up the query. std::shared_ptr>> op_happens_before_; + + private: + const std::vector* instructions_; // not_own }; /// ======================== /// /// For new ir /// /// ======================== /// -class IrDependencyBuilder { +class NewIrDependencyBuilder : public DependencyBuilder { public: - IrDependencyBuilder() : is_build_(false), instructions_(nullptr) {} + NewIrDependencyBuilder(); // build op dependencies and return the mapping from op to its downstream-op // set const std::map>& Build( - const std::vector& instructions); - - const std::map>& OpDownstreamMap() const; - - bool OpHappensBefore(size_t prior_op_idx, size_t posterior_op_idx) const { - PADDLE_ENFORCE_GE( - op_happens_before_.size(), - 0, - phi::errors::Unavailable("op_happen_before is not yet built")); - return op_happens_before_.at(prior_op_idx).at(posterior_op_idx); - } - - private: - void AddDependencyForCoalesceTensorOp(); - void AddDependencyForCommunicationOp(); - void AddDependencyForRandomOp(); - void AddDependencyForReadOp(); - void AddDependencyForSequentialRun(); - - void AddDownstreamOp(size_t prior_op_idx, size_t posterior_op_idx); + const std::vector>& + instructions); void BuildDownstreamMap(); - void ShrinkDownstreamMap(); - - bool is_build_; - const std::vector* + private: + const std::vector>* instructions_; // not_own - size_t op_num_; - - // ops_behind_ is the adjacency list about op to its posterior-ops, that is to - // say, op_behind_[i] == {a, b, c} means op[a], op[b] and op[c] depend on - // op[i] directly or indirectly. ops_before_ is the revered adjacency list of - // ops_behind_. - std::vector> ops_before_; - std::vector> ops_behind_; - - // op_downstream_map_ is the mapping from op to its downstream-op set, that is - // to say, op_downstream_map_[i] == {a, b, c} means op[a], op[b] and op[c] - // depend on op[i] directly. - std::map> op_downstream_map_; - - // op_happens_before_ is a matrix form of ops_before_ and ops_behind_, it is - // used to speed up the query. 
- std::vector> op_happens_before_; }; } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 7583872e95a0ff..7793c4a37ba553 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -157,7 +157,15 @@ bool IsCpuOp(const Instruction& instr) { return platform::is_cpu_place(instr.DeviceContext().GetPlace()); } -bool IsCpuOp(const paddle::framework::InstructionBase* instr) { +bool IsCpuOp(Instruction* instr) { + return platform::is_cpu_place(instr->DeviceContext().GetPlace()); +} + +bool IsCpuOp(const paddle::framework::InstructionBase& instr) { + return platform::is_cpu_place(instr.DeviceContext().GetPlace()); +} + +bool IsCpuOp(paddle::framework::InstructionBase* instr) { return platform::is_cpu_place(instr->DeviceContext().GetPlace()); } @@ -178,7 +186,11 @@ bool IsMemcpyH2D(const Instruction& instr) { return instr.OpBase()->Type() == kMemcpyH2D; } -bool IsMemcpyH2D(const paddle::framework::InstructionBase* instr) { +bool IsMemcpyH2D(Instruction* instr) { + return instr->OpBase()->Type() == kMemcpyH2D; +} + +bool IsMemcpyH2D(paddle::framework::InstructionBase* instr) { return instr->Name() == "pd.memcpy_h2d"; } diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index e31e60ed6e6601..b37e46d5206e66 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -72,6 +72,10 @@ bool IsCommunicationOp(const Instruction& instr); bool IsCpuOp(const Instruction& instr); +bool IsCpuOp(Instruction* instr); + +bool IsCpuOp(const paddle::framework::InstructionBase& instr); + bool IsCpuOp(const paddle::framework::InstructionBase* instr); bool IsGradOp(const std::string& op_name); @@ -80,7 +84,9 @@ bool IsMemcpyD2H(const Instruction& instr); bool IsMemcpyH2D(const Instruction& instr); -bool IsMemcpyH2D(const paddle::framework::InstructionBase* instr); +bool IsMemcpyH2D(Instruction* instr); + +bool IsMemcpyH2D(paddle::framework::InstructionBase* instr); bool IsMemcpyOp(const Instruction& instr); diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 9a28708730a48c..5ab2e2fff163a7 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -44,6 +44,11 @@ void StreamAnalyzer::ConstructEvents(std::vector* instructions) { cross_step_merged_instructions.emplace_back(instr); } + std::vector cross_step_merged_instructions_ptr; + for (Instruction& instr : cross_step_merged_instructions) { + cross_step_merged_instructions_ptr.emplace_back(&instr); + } + DependencyBuilder dependency_builder; dependency_builder.Build(cross_step_merged_instructions); @@ -56,10 +61,10 @@ void StreamAnalyzer::ConstructEvents(std::vector* instructions) { /*number_of_run_type = */ 2)); // instr_id -> run_type -> // next_instr_id AnalyseAllRunType( - cross_step_merged_instructions, downstream_map, &run_type_info); + cross_step_merged_instructions_ptr, downstream_map, &run_type_info); AnalyseAllEventInfo( - cross_step_merged_instructions, run_type_info, event_info_.get()); + cross_step_merged_instructions_ptr, run_type_info, 
event_info_.get()); ShrinkEventInfo(dependency_builder, event_info_.get()); is_event_info_build_ = true; } @@ -175,38 +180,42 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext( return op_func_node.dev_ctx_; } -bool StreamAnalyzer::HasDataDependency(const Instruction& cur_instr, - const Instruction& next_instr) const { - auto no_need_buffer_ins = - [](const Instruction& instr) -> const std::unordered_set { - auto* op = instr.OpBase(); - auto& inferer = op->Info().NoNeedBufferVarsInferer(); - if (inferer) { - return inferer(op->Inputs(), op->Outputs(), op->Attrs()); - } - return std::unordered_set(); - }; +const std::unordered_set no_need_buffer_ins(Instruction* instr) { + auto* op = instr->OpBase(); + auto& inferer = op->Info().NoNeedBufferVarsInferer(); + if (inferer) { + return inferer(op->Inputs(), op->Outputs(), op->Attrs()); + } + return std::unordered_set(); +} +const std::unordered_set no_need_buffer_ins( + const paddle::framework::InstructionBase* instr) { + return instr->NoNeedBuffer(); +} + +template +bool has_data_dependency(T1* cur_instr, T1* next_instr) { // cur_instr->var->next_instr std::unordered_set cur_var_ids; - for (auto& item : cur_instr.Outputs()) { + for (auto& item : cur_instr->Outputs()) { cur_var_ids.insert(item.second.begin(), item.second.end()); } - const std::unordered_set next_instr_no_need_buffer_ins = + const std::unordered_set next_instr_no_need_buffer_ins = no_need_buffer_ins(next_instr); - for (auto& item : next_instr.Inputs()) { + for (auto& item : next_instr->Inputs()) { if (next_instr_no_need_buffer_ins.find(item.first) != next_instr_no_need_buffer_ins.end()) { continue; } for (auto next_var_id : item.second) { if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) { - VLOG(6) << "Found data dependency from " << cur_instr.OpBase()->Type() - << "(" << cur_instr.Id() << ") to " - << next_instr.OpBase()->Type() << "(" << next_instr.Id() - << ") at variable " << item.first << "(" << next_var_id << ")"; + VLOG(6) << "Found data dependency from " + << "cur_instr(" << cur_instr->Id() << ") to " + << "next_instr(" << next_instr->Id() << ") at variable " + << item.first << "(" << next_var_id << ")"; return true; } } @@ -214,22 +223,22 @@ bool StreamAnalyzer::HasDataDependency(const Instruction& cur_instr, // cur_instr->var && next_instr->var // var->cur_instr && next_instr->var - const std::unordered_set cur_instr_no_need_buffer_ins = + const std::unordered_set cur_instr_no_need_buffer_ins = no_need_buffer_ins(cur_instr); - for (auto& item : cur_instr.Inputs()) { + for (auto& item : cur_instr->Inputs()) { if (cur_instr_no_need_buffer_ins.find(item.first) == cur_instr_no_need_buffer_ins.end()) { cur_var_ids.insert(item.second.begin(), item.second.end()); } } - for (auto& item : next_instr.Outputs()) { + for (auto& item : next_instr->Outputs()) { for (auto next_var_id : item.second) { if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) { - VLOG(6) << "Found data dependency from " << cur_instr.OpBase()->Type() - << "(" << cur_instr.Id() << ") to " - << next_instr.OpBase()->Type() << "(" << next_instr.Id() - << ") at variable " << item.first << "(" << next_var_id << ")"; + VLOG(6) << "Found data dependency from " + << "cur_instr(" << cur_instr->Id() << ") to " + << "next_instr(" << next_instr->Id() << ") at variable " + << item.first << "(" << next_var_id << ")"; return true; } } @@ -238,64 +247,80 @@ bool StreamAnalyzer::HasDataDependency(const Instruction& cur_instr, return false; } -void StreamAnalyzer::AnalyseAllEventInfo( - const std::vector& 
instructions, - const std::vector>>& run_type_info, - std::map>>* - event_info) const { - for (size_t cur_instr_id = 0; cur_instr_id < instructions.size(); - ++cur_instr_id) { - const std::vector& next_instr_ids = - run_type_info[cur_instr_id][DownstreamRunType::kEventRun]; - std::set waiter_instr_ids; - std::set visited_next_instr_id; +template +DownstreamRunType analyse_run_type_for_two_instructions(T* cur_instr, + T* next_instr, + const Place& place) { + // xpu&ipu memcpy kernel is synchronous. + if (platform::is_ipu_place(place) || platform::is_xpu_place(place)) { + return DownstreamRunType::kDirectRun; + } - for (size_t next_instr_id : next_instr_ids) { - AnalyseEventInfoForTwoInstructions(instructions, - run_type_info, - cur_instr_id, - next_instr_id, - &waiter_instr_ids, - &visited_next_instr_id); + // npu d2h kernel is asynchronous. + if (platform::is_custom_place(place)) { + if (interpreter::IsCpuOp(cur_instr) || + interpreter::IsMemcpyH2D(next_instr)) { + return DownstreamRunType::kDirectRun; } + } - for (size_t waiter_instr_id : waiter_instr_ids) { - (*event_info)[&(instructions[cur_instr_id].DeviceContext())] - [waiter_instr_id] - .insert(cur_instr_id); - } + if (cur_instr->KernelType() == OpFuncType::kGpuAsync && + (&cur_instr->DeviceContext() != &next_instr->DeviceContext())) { + return DownstreamRunType::kEventRun; } + + return DownstreamRunType::kDirectRun; } -void StreamAnalyzer::AnalyseAllRunType( - const std::vector& instructions, +template +void analyse_all_run_type( + const std::vector& instructions, const std::map>& downstream_map, - std::vector>>* run_type_info) const { + const Place& place, + std::vector>>* run_type_info) { for (auto& item : downstream_map) { size_t cur_instr_id = item.first; - const Instruction& cur_instr = instructions[item.first]; + T* cur_instr = instructions[item.first]; for (size_t next_instr_id : item.second) { - const Instruction& next_instr = instructions[next_instr_id]; - DownstreamRunType run_type = - AnalyseRunTypeForTwoInstructions(cur_instr, next_instr); + T* next_instr = instructions[next_instr_id]; + DownstreamRunType run_type = analyse_run_type_for_two_instructions( + cur_instr, next_instr, place); (*run_type_info)[cur_instr_id][run_type].push_back(next_instr_id); - VLOG(6) << RunTypeToString(run_type) << ": " << cur_instr.OpBase()->Type() - << "(" << cur_instr_id << ") -> " << next_instr.OpBase()->Type() - << "(" << next_instr_id << ")"; + VLOG(6) << RunTypeToString(run_type) << ": " + << "cur_instr_id(" << cur_instr_id << ") -> " + << "next_instr_id(" << next_instr_id << ")"; } } } +void StreamAnalyzer::AnalyseAllRunType( + const std::vector& instructions, + const std::map>& downstream_map, + std::vector>>* run_type_info) const { + analyse_all_run_type( + instructions, downstream_map, place_, run_type_info); +} + // The caller should guarantee cur_instr and next_instr are kEventRun -void StreamAnalyzer::AnalyseEventInfoForTwoInstructions( - const std::vector& instructions, +template +void analyse_event_info_for_two_instructions( + const std::vector& instructions, const std::vector>>& run_type_info, const size_t cur_instr_id, const size_t next_instr_id, std::set* waiter_instr_ids, - std::set* visited_next_instr_id) const; + std::set* visited_next_instr_id); + +template <> +void analyse_event_info_for_two_instructions( + const std::vector& instructions, + const std::vector>>& run_type_info, + const size_t cur_instr_id, + const size_t next_instr_id, + std::set* waiter_instr_ids, + std::set* visited_next_instr_id) { if 
(visited_next_instr_id->find(next_instr_id) != visited_next_instr_id->end()) { return; @@ -320,10 +345,11 @@ void StreamAnalyzer::AnalyseEventInfoForTwoInstructions( // There is actually a data dependency between op1 and op2 that var0 and // fused_var share the same tensor. However, as the dependency is implicit, we // can only add event for it with the help of depend_op. - if (HasDataDependency(instructions[cur_instr_id], - instructions[next_instr_id]) || + + if (has_data_dependency( + instructions[cur_instr_id], instructions[next_instr_id]) || !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || - instructions[next_instr_id].OpBase()->Type() == "depend") { + instructions[next_instr_id]->OpBase()->Type() == "depend") { waiter_instr_ids->insert(next_instr_id); return; } @@ -337,19 +363,58 @@ void StreamAnalyzer::AnalyseEventInfoForTwoInstructions( // between cur_instr and next_instr. for (size_t instr_id : run_type_info[next_instr_id][DownstreamRunType::kDirectRun]) { - AnalyseEventInfoForTwoInstructions(instructions, - run_type_info, - cur_instr_id, - instr_id, - waiter_instr_ids, - visited_next_instr_id); + analyse_event_info_for_two_instructions(instructions, + run_type_info, + cur_instr_id, + instr_id, + waiter_instr_ids, + visited_next_instr_id); } } -void StreamAnalyzer::ShrinkEventInfo( - const DependencyBuilder& dependency_builder, +template +void analyse_all_event_info( + const std::vector& instructions, + const std::vector>>& run_type_info, + std::map>>* + event_info) { + for (size_t cur_instr_id = 0; cur_instr_id < instructions.size(); + ++cur_instr_id) { + const std::vector& next_instr_ids = + run_type_info[cur_instr_id][DownstreamRunType::kEventRun]; + std::set waiter_instr_ids; + std::set visited_next_instr_id; + + for (size_t next_instr_id : next_instr_ids) { + analyse_event_info_for_two_instructions(instructions, + run_type_info, + cur_instr_id, + next_instr_id, + &waiter_instr_ids, + &visited_next_instr_id); + } + + for (size_t waiter_instr_id : waiter_instr_ids) { + (*event_info)[&(instructions[cur_instr_id]->DeviceContext())] + [waiter_instr_id] + .insert(cur_instr_id); + } + } +} + +void StreamAnalyzer::AnalyseAllEventInfo( + const std::vector& instructions, + const std::vector>>& run_type_info, std::map>>* event_info) const { + analyse_all_event_info(instructions, run_type_info, event_info); +} + +template +void shrink_event_info( + const T& dependency_builder, + std::map>>* + event_info) { for (auto& item : *event_info) { // shrink redundant recorders, waiter instrs should only wait for the last // recorder instrs in each stream @@ -409,6 +474,13 @@ void StreamAnalyzer::ShrinkEventInfo( } } +void StreamAnalyzer::ShrinkEventInfo( + const DependencyBuilder& dependency_builder, + std::map>>* + event_info) const { + shrink_event_info(dependency_builder, event_info); +} + platform::DeviceType StreamAnalyzer::GetWaiterType( const Instruction& instr) const { if (instr.KernelType() == OpFuncType::kCpuSync) { @@ -423,29 +495,6 @@ platform::DeviceType StreamAnalyzer::GetWaiterType( } } -DownstreamRunType StreamAnalyzer::AnalyseRunTypeForTwoInstructions( - const Instruction& cur_instr, const Instruction& next_instr) const { - // xpu&ipu memcpy kerenl is synchronous. - if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) { - return DownstreamRunType::kDirectRun; - } - - // npu d2h kernel is asynchronous. 
- if (platform::is_custom_place(place_)) { - if (interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)) { - return DownstreamRunType::kDirectRun; - } - } - - if (cur_instr.KernelType() == OpFuncType::kGpuAsync && - (&cur_instr.DeviceContext() != &next_instr.DeviceContext())) { - return DownstreamRunType::kEventRun; - } - - return DownstreamRunType::kDirectRun; -} - std::shared_ptr< std::map>>> StreamAnalyzer::GetEventInfo() const { @@ -460,37 +509,47 @@ void StreamAnalyzer::ShareEventInfoFrom(const StreamAnalyzer& src) { /// ======================== /// /// For new ir /// /// ======================== /// -void IrStreamAnalyzer::ConstructEvents( - std::vector* instructions) const { - std::vector - cross_step_merged_instructions = *instructions; - for (paddle::framework::InstructionBase* instr : *instructions) { - cross_step_merged_instructions.emplace_back(instr); - } +void NewIrStreamAnalyzer::ConstructEvents( + std::vector>* + instructions) { + if (!is_event_info_build_) { + std::vector> + cross_step_merged_instructions = *instructions; + for (auto instr : *instructions) { + cross_step_merged_instructions.emplace_back(instr); + } - IrDependencyBuilder dependency_builder; - dependency_builder.Build(cross_step_merged_instructions); - - const std::map>& downstream_map = - dependency_builder.OpDownstreamMap(); - const size_t instr_num = cross_step_merged_instructions.size(); - std::vector>> run_type_info( - instr_num, - std::vector>( - /*number_of_run_type = */ 2)); // instr_id -> run_type -> - // next_instr_id - AnalyseAllRunType( - cross_step_merged_instructions, downstream_map, &run_type_info); - - std::map>> - event_info; // DeviceContext -> waiter_instr_id -> recorder_instr_ids - AnalyseAllEventInfo( - cross_step_merged_instructions, run_type_info, &event_info); - ShrinkEventInfo(dependency_builder, &event_info); + std::vector + cross_step_merged_instructions_ptr; + for (auto instr : cross_step_merged_instructions) { + cross_step_merged_instructions_ptr.push_back(instr.get()); + } + + NewIrDependencyBuilder dependency_builder; + dependency_builder.Build(cross_step_merged_instructions); + const std::map>& downstream_map = + dependency_builder.OpDownstreamMap(); + + const size_t instr_num = cross_step_merged_instructions.size(); + std::vector>> run_type_info( + instr_num, + std::vector>( + /*number_of_run_type = */ 2)); // instr_id -> run_type -> + // next_instr_id + AnalyseAllRunType( + cross_step_merged_instructions_ptr, downstream_map, &run_type_info); + + AnalyseAllEventInfo( + cross_step_merged_instructions_ptr, run_type_info, event_info_.get()); + + ShrinkEventInfo(dependency_builder, event_info_.get()); + + is_event_info_build_ = true; + } // Construct events std::map> instr2event; - for (auto& context_item : event_info) { + for (auto& context_item : *event_info_) { for (auto& waiter_item : context_item.second) { size_t waiter_instr_id = waiter_item.first; std::set& recorder_instr_ids = waiter_item.second; @@ -506,9 +565,9 @@ void IrStreamAnalyzer::ConstructEvents( } paddle::framework::InstructionBase* recorder_instr = - instructions->at(recorder_instr_id); + instructions->at(recorder_instr_id).get(); paddle::framework::InstructionBase* waiter_instr = - instructions->at(waiter_instr_id); + instructions->at(waiter_instr_id).get(); platform::DeviceType waiter_type = GetWaiterType(waiter_instr); if (instr2event.find(recorder_instr_id) == instr2event.end()) { @@ -531,273 +590,55 @@ void IrStreamAnalyzer::ConstructEvents( } } -platform::DeviceType 
IrStreamAnalyzer::GetWaiterType( - const paddle::framework::InstructionBase* instr) const { - if (instr->KernelType() == OpFuncType::kCpuSync) { - return platform::kCPU; - } else { - if (platform::is_xpu_place(place_)) { - return platform::kXPU; - } else if (platform::is_custom_place(place_)) { - return platform::kCUSTOM_DEVICE; - } - return platform::kCUDA; - } -} - -void IrStreamAnalyzer::AnalyseAllRunType( +void NewIrStreamAnalyzer::AnalyseAllRunType( const std::vector& instructions, const std::map>& downstream_map, std::vector>>* run_type_info) const { - for (auto& item : downstream_map) { - size_t cur_instr_id = item.first; - const paddle::framework::InstructionBase* cur_instr = - instructions[item.first]; - for (size_t next_instr_id : item.second) { - const paddle::framework::InstructionBase* next_instr = - instructions[next_instr_id]; - DownstreamRunType run_type = - AnalyseRunTypeForTwoInstructions(cur_instr, next_instr); - - (*run_type_info)[cur_instr_id][run_type].push_back(next_instr_id); - - VLOG(6) << RunTypeToString(run_type) << ": " << cur_instr->Name() << "(" - << cur_instr_id << ") -> " << next_instr->Name() << "(" - << next_instr_id << ")"; - } - } -} - -DownstreamRunType IrStreamAnalyzer::AnalyseRunTypeForTwoInstructions( - const paddle::framework::InstructionBase* cur_instr, - const paddle::framework::InstructionBase* next_instr) const { - // xpu&ipu memcpy kerenl is synchronous. - if (platform::is_ipu_place(place_) || platform::is_xpu_place(place_)) { - return DownstreamRunType::kDirectRun; - } - - // npu d2h kernel is asynchronous. - if (platform::is_custom_place(place_)) { - if (interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)) { - return DownstreamRunType::kDirectRun; - } - } - - if (cur_instr->KernelType() == OpFuncType::kGpuAsync && - (&cur_instr->DeviceContext() != &next_instr->DeviceContext())) { - return DownstreamRunType::kEventRun; - } - - return DownstreamRunType::kDirectRun; + analyse_all_run_type( + instructions, downstream_map, place_, run_type_info); } -void IrStreamAnalyzer::AnalyseAllEventInfo( +void NewIrStreamAnalyzer::AnalyseAllEventInfo( const std::vector& instructions, const std::vector>>& run_type_info, std::map>>* event_info) const { - for (size_t cur_instr_id = 0; cur_instr_id < instructions.size(); - ++cur_instr_id) { - const std::vector& next_instr_ids = - run_type_info[cur_instr_id][DownstreamRunType::kEventRun]; - std::set waiter_instr_ids; - std::set visited_next_instr_id; - - for (size_t next_instr_id : next_instr_ids) { - AnalyseEventInfoForTwoInstructions(instructions, - run_type_info, - cur_instr_id, - next_instr_id, - &waiter_instr_ids, - &visited_next_instr_id); - } - - for (size_t waiter_instr_id : waiter_instr_ids) { - (*event_info)[&(instructions[cur_instr_id]->DeviceContext())] - [waiter_instr_id] - .insert(cur_instr_id); - } - } + analyse_all_event_info( + instructions, run_type_info, event_info); } -// The caller should guarantee cur_instr and next_instr is kEventRun -void IrStreamAnalyzer::AnalyseEventInfoForTwoInstructions( - const std::vector& instructions, - const std::vector>>& run_type_info, - const size_t cur_instr_id, - const size_t next_instr_id, - std::set* waiter_instr_ids, - std::set* visited_next_instr_id) const { - if (visited_next_instr_id->find(next_instr_id) != - visited_next_instr_id->end()) { - return; - } - visited_next_instr_id->insert(next_instr_id); - - // NOTE(Ruibiao): Though depend_op as next_instr is no_need_buffer, we should - // also wait event for it. 
Because depend_op is used to build dependencies for - // fused vars in some scenarios. In those cases, we do not know which vars may - // lead a implicit data dependency. For example, - // ### - // ### fused_var = fuse_op(var0, ...) - // ### var1 = op1(fused_var) - // ### var0 = depend_op(var0, fused_var) - // ### var2 = op2(var0) - // ### - // If op1 are cross-stream with depend_op and op2, then we have: - // ### - // ### event_run : op1 -> depend_op - // ### direct_run : depend_op -> op2 - // ### - // There is actually a data dependency between op1 and op2 that var0 and - // fused_var share the same tensor. However, as the dependency is implicit, we - // can only add event for it with the help of depend_op. - if (HasDataDependency(instructions[cur_instr_id], - instructions[next_instr_id]) || - !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() || - instructions[next_instr_id]->Name() == "depend") { - waiter_instr_ids->insert(next_instr_id); - return; - } - - // NOTE(Ruibiao): If no data dependency from cur_instr to next_instr, and - // simultaneously next_instr has no event_run downstream instr, we try to - // recursively add events between cur_instr and next_instr's - // direct-run-instrs. This can delay the event wait and achieve better - // scheduling performance in some scenarios. However, when next_instr has too - // many direct-run-instrs, it may perform worse than add event directly - // between cur_instr and next_instr. - for (size_t instr_id : - run_type_info[next_instr_id][DownstreamRunType::kDirectRun]) { - AnalyseEventInfoForTwoInstructions(instructions, - run_type_info, - cur_instr_id, - instr_id, - waiter_instr_ids, - visited_next_instr_id); - } +void NewIrStreamAnalyzer::ShrinkEventInfo( + const NewIrDependencyBuilder& dependency_builder, + std::map>>* + event_info_map) const { + shrink_event_info(dependency_builder, event_info_map); } -bool IrStreamAnalyzer::HasDataDependency( - const paddle::framework::InstructionBase* cur_instr, - const paddle::framework::InstructionBase* next_instr) const { - auto no_need_buffer_ins = [](const paddle::framework::InstructionBase* instr) - -> const std::unordered_set { return instr->NoNeedBuffer(); }; - - // cur_instr->var->next_instr - std::unordered_set cur_var_ids; - for (auto& item : cur_instr->Outputs()) { - cur_var_ids.insert(item.second.begin(), item.second.end()); - } - - const std::unordered_set next_instr_no_need_buffer_ins = - no_need_buffer_ins(next_instr); - - for (auto& item : next_instr->Inputs()) { - if (next_instr_no_need_buffer_ins.find(item.first) != - next_instr_no_need_buffer_ins.end()) { - continue; - } - for (auto next_var_id : item.second) { - if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) { - VLOG(6) << "Found data dependency from " << cur_instr->Name() << "(" - << cur_instr->Id() << ") to " << next_instr->Name() << "(" - << next_instr->Id() << ") at variable " << item.first.impl() - << "(" << next_var_id << ")"; - return true; - } - } - } - - // cur_instr->var && next_instr->var - // var->cur_instr && next_instr->var - const std::unordered_set cur_instr_no_need_buffer_ins = - no_need_buffer_ins(cur_instr); - for (auto& item : cur_instr->Inputs()) { - if (cur_instr_no_need_buffer_ins.find(item.first) == - cur_instr_no_need_buffer_ins.end()) { - cur_var_ids.insert(item.second.begin(), item.second.end()); - } - } - - for (auto& item : next_instr->Outputs()) { - for (auto next_var_id : item.second) { - if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) { - VLOG(6) << "Found data 
dependency from " << cur_instr->Name() << "(" - << cur_instr->Id() << ") to " << next_instr->Name() << "(" - << next_instr->Id() << ") at variable " << item.first.impl() - << "(" << next_var_id << ")"; - return true; - } +platform::DeviceType NewIrStreamAnalyzer::GetWaiterType( + const paddle::framework::InstructionBase* instr) const { + if (instr->KernelType() == OpFuncType::kCpuSync) { + return platform::kCPU; + } else { + if (platform::is_xpu_place(place_)) { + return platform::kXPU; + } else if (platform::is_custom_place(place_)) { + return platform::kCUSTOM_DEVICE; } + return platform::kCUDA; } - - return false; } -void IrStreamAnalyzer::ShrinkEventInfo( - const IrDependencyBuilder& dependency_builder, - std::map>>* - event_info) const { - for (auto& item : *event_info) { - // shrink redundant recorders, waiter instrs should only wait for the last - // recorder instrs in each stream - std::map>& waiter_recorder_map = item.second; - for (auto& waiter_recorder : waiter_recorder_map) { - size_t waiter_instr_id = waiter_recorder.first; - std::set& recorder_instr_ids = waiter_recorder.second; - std::set unnecessary_recorder_instr_ids; - for (size_t cur_instr_id : recorder_instr_ids) { - for (size_t next_instr_id : recorder_instr_ids) { - if (dependency_builder.OpHappensBefore(cur_instr_id, next_instr_id)) { - unnecessary_recorder_instr_ids.insert(cur_instr_id); - break; - } - } - } - - for (size_t unnecessary_recorder_instr_id : - unnecessary_recorder_instr_ids) { - VLOG(8) << "Shrink event : " << unnecessary_recorder_instr_id << " -> " - << waiter_instr_id; - recorder_instr_ids.erase(unnecessary_recorder_instr_id); - } - } - - // shrink redundant waiters, recorder instrs should only wait by the first - // waiter instrs in each stream - std::map> recorder_waiter_map; - for (auto& waiter_recorder : waiter_recorder_map) { - size_t waiter_instr_id = waiter_recorder.first; - std::set& recorder_instr_ids = waiter_recorder.second; - for (size_t record_instr_id : recorder_instr_ids) { - recorder_waiter_map[record_instr_id].insert(waiter_instr_id); - } - } - - for (auto& recorder_waiter : recorder_waiter_map) { - size_t recorder_instr_id = recorder_waiter.first; - std::set& waiter_instr_ids = recorder_waiter.second; - std::set unnecessary_waiter_instr_ids; - for (size_t cur_instr_id : waiter_instr_ids) { - for (size_t next_instr_id : waiter_instr_ids) { - if (dependency_builder.OpHappensBefore(cur_instr_id, next_instr_id)) { - unnecessary_waiter_instr_ids.insert(next_instr_id); - break; - } - } - } +void NewIrStreamAnalyzer::ShareEventInfoFrom(const StreamAnalyzer& src) { + event_info_ = src.GetEventInfo(); + is_event_info_build_ = true; +} - for (size_t unnecessary_wiater_instr_id : unnecessary_waiter_instr_ids) { - VLOG(8) << "Shrink event : " << recorder_instr_id << " -> " - << unnecessary_wiater_instr_id; - waiter_recorder_map[unnecessary_wiater_instr_id].erase( - recorder_instr_id); - } - } - } +std::shared_ptr< + std::map>>> +NewIrStreamAnalyzer::GetEventInfo() const { + return event_info_; } + } // namespace interpreter } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h index 1e738da044d226..aa314b54eae33d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h @@ -91,36 +91,24 @@ class StreamAnalyzer { GetEventInfo() const; private: - bool 
HasDataDependency(const Instruction& cur_instr, - const Instruction& next_instr) const; + bool HasDataDependency(Instruction* cur_instr, Instruction* next_instr) const; void AnalyseAllEventInfo( - const std::vector& instructions, + const std::vector& instructions, const std::vector>>& run_type_info, std::map>>* event_info) const; void AnalyseAllRunType( - const std::vector& instructions, + const std::vector& instructions, const std::map>& downstream_map, std::vector>>* run_type_info) const; - void AnalyseEventInfoForTwoInstructions( - const std::vector& instructions, - const std::vector>>& run_type_info, - const size_t cur_instr_id, - const size_t next_instr_id, - std::set* waiter_instr_ids, - std::set* visited_next_instr_id) const; - void ShrinkEventInfo( const DependencyBuilder& dependency_builder, std::map>>* event_info_map) const; - DownstreamRunType AnalyseRunTypeForTwoInstructions( - const Instruction& cur_instr, const Instruction& next_instr) const; - const Place place_; bool is_event_info_build_{false}; std::shared_ptr< @@ -131,55 +119,53 @@ class StreamAnalyzer { /// ======================== /// /// For new ir /// /// ======================== /// -class IrStreamAnalyzer { +class NewIrStreamAnalyzer { public: using DeviceContext = platform::DeviceContext; using Place = platform::Place; - explicit IrStreamAnalyzer(const Place& place) : place_(place) {} + explicit NewIrStreamAnalyzer(const Place& place) : place_(place) { + event_info_ = std::make_shared< + std::map>>>(); + } - ~IrStreamAnalyzer() {} + ~NewIrStreamAnalyzer() {} void ConstructEvents( - std::vector* instructions) const; + std::vector>* + instructions); platform::DeviceType GetWaiterType( const paddle::framework::InstructionBase* instr) const; + void ShareEventInfoFrom(const StreamAnalyzer& src); + + std::shared_ptr< + std::map>>> + GetEventInfo() const; + private: void AnalyseAllRunType( const std::vector& instructions, const std::map>& downstream_map, std::vector>>* run_type_info) const; - DownstreamRunType AnalyseRunTypeForTwoInstructions( - const paddle::framework::InstructionBase* cur_instr, - const paddle::framework::InstructionBase* next_instr) const; - void AnalyseAllEventInfo( const std::vector& instructions, const std::vector>>& run_type_info, std::map>>* event_info) const; - void AnalyseEventInfoForTwoInstructions( - const std::vector& instructions, - const std::vector>>& run_type_info, - const size_t cur_instr_id, - const size_t next_instr_id, - std::set* waiter_instr_ids, - std::set* visited_next_instr_id) const; - - bool HasDataDependency( - const paddle::framework::InstructionBase* cur_instr, - const paddle::framework::InstructionBase* next_instr) const; - void ShrinkEventInfo( - const IrDependencyBuilder& dependency_builder, + const NewIrDependencyBuilder& dependency_builder, std::map>>* event_info_map) const; const Place place_; + bool is_event_info_build_{false}; + std::shared_ptr< + std::map>>> + event_info_; }; } // namespace interpreter diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 039f9191ace725..af3afa66873452 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -98,11 +98,6 @@ NewIRInterpreter::~NewIRInterpreter() { gc_.reset(nullptr); async_work_queue_.reset(); VLOG(4) << "~NewIRInterpreter(): " << this << " on " << place_; - - for (InstructionBase* instr : vec_instruction_base_) { - delete instr; - } - #ifdef 
PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working @@ -277,7 +272,7 @@ FetchList NewIRInterpreter::BetaRun(const std::vector& feed_names, // from gpu_pinned place to gpu place on compute stream. for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - InstructionBase* inst = vec_instruction_base_[i]; + InstructionBase* inst = vec_instruction_base_[i].get(); if (inst->Name() == "pd.memcpy_d2h" && platform::is_gpu_place(place_)) { for (auto& item : inst->Inputs()) { for (auto var_id : item.second) { @@ -1640,14 +1635,14 @@ void NewIRInterpreter::BuildInstruction() { continue; } vec_instruction_base_.emplace_back( - new PhiKernelInstruction(op_idx++, - place_, - (*it), - scope_, - local_scope_, - value_2_var_name_, - var_name_2_id_, - variable_2_var_name_)); + std::make_shared(op_idx++, + place_, + (*it), + scope_, + local_scope_, + value_2_var_name_, + var_name_2_id_, + variable_2_var_name_)); } else { PADDLE_THROW(platform::errors::Unimplemented( "Now only support pd_kernel dialect.")); @@ -1676,7 +1671,7 @@ void NewIRInterpreter::BuildInstructionDependences() { auto downstream_map = ir_dependency_builder_.Build(vec_instruction_base_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { - InstructionBase* cur_instr = vec_instruction_base_[instr_id]; + InstructionBase* cur_instr = vec_instruction_base_[instr_id].get(); const std::set& next_instr_ids = downstream_map[instr_id]; if (FLAGS_new_executor_serial_run) { diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index a14313af4bb668..a6f28619fba54b 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -207,7 +207,7 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::unique_ptr<::ir::Program> ir_program_{nullptr}; - std::vector vec_instruction_base_; + std::vector> vec_instruction_base_; std::unordered_map<::ir::Value, std::string> value_2_var_name_; @@ -218,9 +218,9 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::vector variable_list_; - interpreter::IrDependencyBuilder ir_dependency_builder_; + interpreter::NewIrDependencyBuilder ir_dependency_builder_; - interpreter::IrStreamAnalyzer ir_stream_analyzer_; + interpreter::NewIrStreamAnalyzer ir_stream_analyzer_; }; } // namespace framework From a9d03d6bac0330bd8aba603dfb164348feebd3aa Mon Sep 17 00:00:00 2001 From: zhangbo9674 Date: Thu, 27 Jul 2023 13:40:46 +0000 Subject: [PATCH 16/18] refine code --- .../interpreter/stream_analyzer.cc | 69 +++++++++++++++++-- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 5ab2e2fff163a7..74e4d76a9af55a 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -214,8 +214,7 @@ bool has_data_dependency(T1* cur_instr, T1* next_instr) { if (cur_var_ids.find(next_var_id) != cur_var_ids.end()) { VLOG(6) << "Found data dependency from " << "cur_instr(" << cur_instr->Id() << ") to " - << "next_instr(" << next_instr->Id() << ") at variable " - << item.first << "(" << next_var_id << ")"; + << "next_instr(" << next_instr->Id() << ")"; return true; } } @@ -237,8 +236,7 @@ bool has_data_dependency(T1* cur_instr, T1* next_instr) { if 
        VLOG(6) << "Found data dependency from "
                << "cur_instr(" << cur_instr->Id() << ") to "
-                << "next_instr(" << next_instr->Id() << ") at variable "
-                << item.first << "(" << next_var_id << ")";
+                << "next_instr(" << next_instr->Id() << ")";
         return true;
       }
     }
@@ -258,7 +256,7 @@ DownstreamRunType analyse_run_type_for_two_instructions(T* cur_instr,

   // npu d2h kernel is asynchronous.
   if (platform::is_custom_place(place)) {
-    if (interpreter::IsCpuOp(cur_instr) ||
+    if (platform::is_cpu_place(cur_instr->DeviceContext().GetPlace()) ||
         interpreter::IsMemcpyH2D(next_instr)) {
       return DownstreamRunType::kDirectRun;
     }
@@ -372,6 +370,67 @@ void analyse_event_info_for_two_instructions(
   }
 }

+template <>
+void analyse_event_info_for_two_instructions<
+    paddle::framework::InstructionBase>(
+    const std::vector<paddle::framework::InstructionBase*>& instructions,
+    const std::vector<std::vector<std::vector<size_t>>>& run_type_info,
+    const size_t cur_instr_id,
+    const size_t next_instr_id,
+    std::set<size_t>* waiter_instr_ids,
+    std::set<size_t>* visited_next_instr_id) {
+  if (visited_next_instr_id->find(next_instr_id) !=
+      visited_next_instr_id->end()) {
+    return;
+  }
+  visited_next_instr_id->insert(next_instr_id);
+
+  // NOTE(Ruibiao): Though depend_op, as next_instr, is no_need_buffer, we
+  // should still wait on an event for it, because depend_op is used to build
+  // dependencies for fused vars in some scenarios. In those cases, we do not
+  // know which vars may lead to an implicit data dependency. For example,
+  // ###
+  // ### fused_var = fuse_op(var0, ...)
+  // ### var1 = op1(fused_var)
+  // ### var0 = depend_op(var0, fused_var)
+  // ### var2 = op2(var0)
+  // ###
+  // If op1 is cross-stream with depend_op and op2, then we have:
+  // ###
+  // ### event_run : op1 -> depend_op
+  // ### direct_run : depend_op -> op2
+  // ###
+  // There is actually a data dependency between op1 and op2, since var0 and
+  // fused_var share the same tensor. However, as the dependency is implicit,
+  // we can only add an event for it with the help of depend_op.
+
+  if (has_data_dependency(
+          instructions[cur_instr_id], instructions[next_instr_id]) ||
+      !run_type_info[next_instr_id][DownstreamRunType::kEventRun].empty() ||
+      instructions[next_instr_id]->Name() == "pd.depend") {
+    waiter_instr_ids->insert(next_instr_id);
+    return;
+  }
+
+  // NOTE(Ruibiao): If there is no data dependency from cur_instr to
+  // next_instr, and next_instr has no event_run downstream instr, we try to
+  // recursively add events between cur_instr and next_instr's
+  // direct-run-instrs. This can delay the event wait and achieve better
+  // scheduling performance in some scenarios. However, when next_instr has
+  // too many direct-run-instrs, it may perform worse than adding an event
+  // directly between cur_instr and next_instr.
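+  // A hypothetical illustration of this trade-off: if next_instr direct-runs
+  // into downstream instrs d1, d2 and d3, the recursion below replaces the
+  // single event wait on next_instr with up to three waits, one per
+  // direct-run instr, so it only pays off when the fan-out of next_instr is
+  // small.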
+  for (size_t instr_id :
+       run_type_info[next_instr_id][DownstreamRunType::kDirectRun]) {
+    analyse_event_info_for_two_instructions(
+        instructions,
+        run_type_info,
+        cur_instr_id,
+        instr_id,
+        waiter_instr_ids,
+        visited_next_instr_id);
+  }
+}
+
 template <typename T>
 void analyse_all_event_info(
     const std::vector<T*>& instructions,

From 7fcfadba2a77cfcc75637715a61a4a4faf3cb045 Mon Sep 17 00:00:00 2001
From: zhangbo9674
Date: Fri, 28 Jul 2023 02:24:31 +0000
Subject: [PATCH 17/18] refine code

---
 .../interpreter/dependency_builder.cc         | 23 +++++++--------
 .../interpreter/dependency_builder.h          |  6 ++--
 .../interpreter/stream_analyzer.cc            | 29 +++++++++----------
 .../interpreter/stream_analyzer.h             |  2 +-
 .../new_executor/new_ir_interpreter.cc        | 11 +++++--
 .../new_executor/new_ir_interpreter.h         |  2 +-
 6 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
index fcdb95d86923b7..639885b80e5344 100644
--- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc
@@ -381,10 +381,8 @@ void DependencyBuilder::AddDownstreamOp(size_t prior_op_idx,
   VLOG(8) << prior_op_idx << "->" << posterior_op_idx;

   VLOG(8) << "Add dependency from "
-          << instructions_->at(prior_op_idx).OpBase()->Type() << "("
-          << prior_op_idx << ") to "
-          << instructions_->at(posterior_op_idx).OpBase()->Type() << "("
-          << posterior_op_idx << ")";
+          << "prior_op_idx(" << prior_op_idx << ") to "
+          << "posterior_op_idx(" << posterior_op_idx << ")";
 }

 void DependencyBuilder::BuildDownstreamMap() {
@@ -549,23 +547,22 @@ void DependencyBuilder::UpdateVarMinRwOp(
 /// ======================== ///
 /// For new ir              ///
 /// ======================== ///
-NewIrDependencyBuilder::NewIrDependencyBuilder() : instructions_(nullptr) {
+NewIrDependencyBuilder::NewIrDependencyBuilder() {
   is_build_ = false;
   op_downstream_map_ = std::make_shared<std::map<size_t, std::set<size_t>>>();
   op_happens_before_ = std::make_shared<std::vector<std::vector<bool>>>();
 }

 const std::map<size_t, std::set<size_t>>& NewIrDependencyBuilder::Build(
-    const std::vector<std::unique_ptr<paddle::framework::InstructionBase>>&
-        instructions) {
+    std::vector<paddle::framework::InstructionBase*> instructions) {
   if (is_build_) {
     return *op_downstream_map_;
   }

   std::tie(op_downstream_map_, op_happens_before_) = GetDependency();

-  instructions_ = &instructions;
-  op_num_ = instructions_->size();
+  instructions_ = instructions;
+  op_num_ = instructions_.size();

   ops_before_.assign(op_num_, {});
   ops_behind_.assign(op_num_, {});
@@ -616,7 +613,7 @@ void NewIrDependencyBuilder::BuildDownstreamMap() {
     remove_duplicate.clear();
     // step1: update the op2dependences structure
     for (auto& item :
-         instructions_->at(op_idx)->Inputs()) {  // for all inputs(read only)
+         instructions_.at(op_idx)->Inputs()) {  // for all inputs(read only)
       for (auto var : item.second) {
         if (var2recent_write_op.count(var))
           op2dependences[op_idx].insert(var2recent_write_op[var]);
       }
     }

     for (auto& item :
-         instructions_->at(op_idx)->Outputs()) {  // for all write vars
+         instructions_.at(op_idx)->Outputs()) {  // for all write vars
       for (auto var : item.second) {
         if (var2min_rw_op.count(var)) {
           for (auto dep_op : var2min_rw_op[var]) {
             op2dependences[op_idx].insert(dep_op);
           }
         }
       }
     }

     // step2: update 2 var2xxxx data structure
     for (auto& item :
-         instructions_->at(op_idx)->Outputs()) {  // for all write vars
+         instructions_.at(op_idx)->Outputs()) {  // for all write vars
       for (auto var : item.second) {
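         // record op_idx as the latest writer of var: any later read of var
         // will take a dependency on it, and the min read-write set collapses
         // to this single writer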
         var2recent_write_op[var] = op_idx;
         var2min_rw_op[var] = {static_cast<size_t>(op_idx)};
+      }
     }

     for (auto& item :
-         instructions_->at(op_idx)->Inputs()) {  // for all inputs(read only)
+         instructions_.at(op_idx)->Inputs()) {  // for all inputs(read only)
       for (auto var : item.second) {
         if (remove_duplicate.count(var) == 0) {
           // var in input list and in output list, so remove it.
diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
index ee04bd75201b47..2593b11a2e48a5 100644
--- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
+++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h
@@ -110,14 +110,12 @@ class NewIrDependencyBuilder : public DependencyBuilder {

   // build op dependencies and return the mapping from op to its downstream-op
   // set
   const std::map<size_t, std::set<size_t>>& Build(
-      const std::vector<std::unique_ptr<paddle::framework::InstructionBase>>&
-          instructions);
+      std::vector<paddle::framework::InstructionBase*> instructions);

   void BuildDownstreamMap();

  private:
-  const std::vector<std::unique_ptr<paddle::framework::InstructionBase>>*
-      instructions_;  // not_own
+  std::vector<paddle::framework::InstructionBase*> instructions_;  // not_owned
 };

 }  // namespace interpreter
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
index 74e4d76a9af55a..a83f02a8617318 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc
@@ -569,27 +569,24 @@ void StreamAnalyzer::ShareEventInfoFrom(const StreamAnalyzer& src) {
 /// For new ir              ///
 /// ======================== ///
 void NewIrStreamAnalyzer::ConstructEvents(
-    std::vector<std::shared_ptr<paddle::framework::InstructionBase>>*
+    const std::vector<std::unique_ptr<paddle::framework::InstructionBase>>&
         instructions) {
   if (!is_event_info_build_) {
-    std::vector<std::shared_ptr<paddle::framework::InstructionBase>>
-        cross_step_merged_instructions = *instructions;
-    for (auto instr : *instructions) {
-      cross_step_merged_instructions.emplace_back(instr);
-    }
     std::vector<paddle::framework::InstructionBase*>
         cross_step_merged_instructions_ptr;
-    for (auto instr : cross_step_merged_instructions) {
-      cross_step_merged_instructions_ptr.push_back(instr.get());
+    for (auto& instr : instructions) {
+      cross_step_merged_instructions_ptr.emplace_back(instr.get());
+    }
+    for (auto& instr : instructions) {
+      cross_step_merged_instructions_ptr.emplace_back(instr.get());
     }

     NewIrDependencyBuilder dependency_builder;
-    dependency_builder.Build(cross_step_merged_instructions);
+    dependency_builder.Build(cross_step_merged_instructions_ptr);

     const std::map<size_t, std::set<size_t>>& downstream_map =
         dependency_builder.OpDownstreamMap();
-    const size_t instr_num = cross_step_merged_instructions.size();
+    const size_t instr_num = cross_step_merged_instructions_ptr.size();
     std::vector<std::vector<std::vector<size_t>>> run_type_info(
         instr_num,
         std::vector<std::vector<size_t>>(
@@ -613,20 +610,20 @@
     size_t waiter_instr_id = waiter_item.first;
     std::set<size_t>& recorder_instr_ids = waiter_item.second;

-    if (waiter_instr_id >= instructions->size()) {
-      waiter_instr_id -= instructions->size();
+    if (waiter_instr_id >= instructions.size()) {
+      waiter_instr_id -= instructions.size();
     }

     for (size_t recorder_instr_id : recorder_instr_ids) {
       // Redundant record
-      if (recorder_instr_id >= instructions->size()) {
+      if (recorder_instr_id >= instructions.size()) {
         continue;
       }

       paddle::framework::InstructionBase* recorder_instr =
-          instructions->at(recorder_instr_id).get();
+          instructions.at(recorder_instr_id).get();
       paddle::framework::InstructionBase* waiter_instr =
-          instructions->at(waiter_instr_id).get();
+          instructions.at(waiter_instr_id).get();

       platform::DeviceType waiter_type = GetWaiterType(waiter_instr);
       if (instr2event.find(recorder_instr_id) == instr2event.end()) {
diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
index aa314b54eae33d..8c7d2d5b6ddbca 100644
--- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h
@@ -132,7 +132,7 @@ class NewIrStreamAnalyzer {
   ~NewIrStreamAnalyzer() {}

   void ConstructEvents(
-      std::vector<std::shared_ptr<paddle::framework::InstructionBase>>*
+      const std::vector<std::unique_ptr<paddle::framework::InstructionBase>>&
           instructions);

   platform::DeviceType GetWaiterType(
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
index af3afa66873452..a6f1d27a112079 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc
@@ -267,7 +267,7 @@ FetchList NewIRInterpreter::BetaRun(const std::vector<std::string>& feed_names,

   BuildInstructionDependences();

-  ir_stream_analyzer_.ConstructEvents(&vec_instruction_base_);
+  ir_stream_analyzer_.ConstructEvents(vec_instruction_base_);
   // add event for the input var of jit program, since there are async copied
   // from gpu_pinned place to gpu place on compute stream.
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
@@ -1635,7 +1635,7 @@ void NewIRInterpreter::BuildInstruction() {
       continue;
     }
     vec_instruction_base_.emplace_back(
-        std::make_shared<PhiKernelInstruction>(op_idx++,
+        std::make_unique<PhiKernelInstruction>(op_idx++,
                                                place_,
                                                (*it),
                                                scope_,
@@ -1668,7 +1668,7 @@ void NewIRInterpreter::BuildInstructionDependences() {
   // instr, and set the dependecy_count_
   size_t instr_num = vec_instruction_base_.size();
   dependecy_count_ = std::vector<size_t>(instr_num, 0);
-  auto downstream_map = ir_dependency_builder_.Build(vec_instruction_base_);
+
+  std::vector<paddle::framework::InstructionBase*> instructions_ptr;
+  for (auto& instr : vec_instruction_base_) {
+    instructions_ptr.push_back(instr.get());
+  }
+  auto downstream_map = ir_dependency_builder_.Build(instructions_ptr);

   for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) {
     InstructionBase* cur_instr = vec_instruction_base_[instr_id].get();
diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
index a6f28619fba54b..8011811c44f2fc 100644
--- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h
+++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h
@@ -207,7 +207,7 @@ class NewIRInterpreter : public InterpreterBaseImpl {

   std::unique_ptr<::ir::Program> ir_program_{nullptr};

-  std::vector<std::shared_ptr<InstructionBase>> vec_instruction_base_;
+  std::vector<std::unique_ptr<InstructionBase>> vec_instruction_base_;

   std::unordered_map<::ir::Value, std::string> value_2_var_name_;

From c3bff0996a8ca0a71bcfa5a758853010e212030e Mon Sep 17 00:00:00 2001
From: zhangbo9674
Date: Fri, 28 Jul 2023 02:33:04 +0000
Subject: [PATCH 18/18] fix bug

---
 .../framework/new_executor/instruction/instruction_base.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
index a893ec67d9fe97..7452990a1d9076 100644
--- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h
+++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h
@@ -45,9 +45,9 @@ class InstructionBase {
   OpFuncType KernelType() const;
   void SetKernelType(OpFuncType type) { type_ = type; }

-  int GetStreamPriority() const { return scheduling_priority_; }
-  void SetStreamPriority(SchedulingPriority scheduling_priority) {
-    scheduling_priority_ = scheduling_priority;
+  int GetStreamPriority() const { return stream_priority_; }
+  void SetStreamPriority(int stream_priority) {
+    stream_priority_ = stream_priority;
   }

   SchedulingPriority GetSchedulingPriority() const {