From 06d6edeb7ceb4ce66d33004f276993d11af15a97 Mon Sep 17 00:00:00 2001 From: phlrain Date: Fri, 28 Jul 2023 13:34:36 +0000 Subject: [PATCH 01/22] new ir remove fetch list --- .../new_executor/standalone_executor.cc | 33 ++++++++-- .../new_executor/standalone_executor.h | 2 + .../ir/phi_kernel_adaptor/phi_kernel_util.cc | 22 +++++-- .../ir/phi_kernel_adaptor/phi_kernel_util.h | 66 ++++++++----------- 4 files changed, 74 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index c80f9b36ff98ba..5d2845f9ca6f32 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -65,6 +65,19 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, if (FLAGS_enable_new_ir_in_executor) { VLOG(6) << "begin to translate" << std::endl; auto base_program = paddle::TranslateLegacyProgramToProgram(*program); + + auto block = base_program->block(); + for (auto it = block->begin(); it != block->end(); ++it) { + if ((*it)->name() == "pd.fetch") { + fetch_var_names_.push_back((*it) + ->attributes() + .at("name") + .dyn_cast() + .AsString() + + "@fetch"); + } + } + auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); interpretercores_.emplace_back(std::make_shared( @@ -130,11 +143,23 @@ paddle::framework::FetchList StandaloneExecutor::Run( } // return Fetch Tensors - auto* fetch_var = scope_->FindVar(interpreter::kFetchVarName); - if (fetch_var) { - return std::move(*fetch_var->GetMutable()); + + if (FLAGS_enable_new_ir_in_executor) { + framework::FetchList fetch_res; + + for (auto& var_name : fetch_var_names_) { + auto* var = scope_->FindVar(var_name); + fetch_res.push_back(var->Get()); + } + + return fetch_res; } else { - return {}; + auto* fetch_var = scope_->FindVar(interpreter::kFetchVarName); + if (fetch_var) { + return 
std::move(*fetch_var->GetMutable()); + } else { + return {}; + } } } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.h b/paddle/fluid/framework/new_executor/standalone_executor.h index 0302128d9263da..1da628fe27bb79 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.h +++ b/paddle/fluid/framework/new_executor/standalone_executor.h @@ -50,6 +50,8 @@ class StandaloneExecutor { std::vector> interpretercores_; Scope* scope_; + + std::vector fetch_var_names_; }; } // namespace framework diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 0388ee9791a35b..a04c6b46230174 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -187,13 +187,21 @@ void HandleForSpecialOp( if (op_name == "pd.fetch") { // fetch is a very special op, with no output - auto var = const_cast(inner_scope->root()) - ->Var("fetch"); - VLOG(6) << "Create var: fetch in scope " << inner_scope->root(); - auto fetch_list = var->GetMutable(); - int index = - op->attributes().at("col").dyn_cast().data(); - fetch_list->resize(index + 1); + auto fetch_src_name = + op->attributes().at("name").dyn_cast().AsString(); + + auto fetch_var_name = fetch_src_name + "@fetch"; + auto* var = const_cast(inner_scope->root()) + ->Var(fetch_var_name); + var->GetMutable(); + auto value = op->result(0); + + value_2_var_name->emplace(value, fetch_var_name); + + auto id = var_name_2_id->size(); + var_name_2_id->emplace(fetch_var_name, id); + variable_list->push_back(var); + variable_2_var_name->emplace(var, fetch_var_name); } if (op_name == "pd.feed") { diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h index f3021ad4765321..27b09a8b6c3b52 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h @@ -285,47 
+285,37 @@ void BuildPhiContext(ir::Operation* op, } // TODO(phlrain): use var type instead of op name - if (op->attributes().count("op_name") && - (op->attributes().at("op_name").dyn_cast().AsString() == - "pd.fetch")) { - // process fetch op - auto fetch_var = inner_scope->FindVar("fetch"); - auto* fetch_list = fetch_var->GetMutable(); - int index = - op->attributes().at("col").dyn_cast().data(); - auto* out_tensor = &(PADDLE_GET(phi::DenseTensor, fetch_list->at(index))); - ctx->EmplaceBackOutput(out_tensor); - } else { - for (size_t i = 0; i < op->num_results(); ++i) { - ir::Value out_ptr = op->result(i); - auto name = name_map.at(out_ptr); - VLOG(6) << "ctx->EmplaceBackOutput: " << name; - auto out_type = out_ptr.type(); - if (!out_type) { - phi::DenseTensor* ptr = nullptr; - OutType out_ptr(ptr); - ctx->EmplaceBackOutput(out_ptr); - } else if (out_type.isa()) { - ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name)->Get())))); - } else if (out_type.isa()) { - ctx->EmplaceBackOutput(OutType(const_cast( - &(inner_scope->FindVar(name)->Get())))); - } else if (out_type.isa()) { - OutListType outputs; - auto& variable_array = - scope->FindVar(name)->Get(); - for (size_t i = 0; i < variable_array.size(); ++i) { - outputs.emplace_back(OutType(const_cast( - &(variable_array[i]->Get())))); - } - ctx->EmplaceBackOutputs(outputs); - } else { - PADDLE_THROW( - phi::errors::Unimplemented("only support DenseTensor and vector ")); + + for (size_t i = 0; i < op->num_results(); ++i) { + ir::Value out_ptr = op->result(i); + auto name = name_map.at(out_ptr); + VLOG(6) << "ctx->EmplaceBackOutput: " << name; + auto out_type = out_ptr.type(); + if (!out_type) { + phi::DenseTensor* ptr = nullptr; + OutType out_ptr(ptr); + ctx->EmplaceBackOutput(out_ptr); + } else if (out_type.isa()) { + ctx->EmplaceBackOutput(OutType(const_cast( + &(inner_scope->FindVar(name)->Get())))); + } else if (out_type.isa()) { + ctx->EmplaceBackOutput(OutType(const_cast( + 
&(inner_scope->FindVar(name)->Get())))); + } else if (out_type.isa()) { + OutListType outputs; + auto& variable_array = + scope->FindVar(name)->Get(); + for (size_t i = 0; i < variable_array.size(); ++i) { + outputs.emplace_back(OutType(const_cast( + &(variable_array[i]->Get())))); } + ctx->EmplaceBackOutputs(outputs); + } else { + PADDLE_THROW( + phi::errors::Unimplemented("only support DenseTensor and vector ")); } } + VLOG(6) << "Done build phi context"; } From 774196e0e89721aecf41c6b7e4ceec6c2cc82597 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 29 Jul 2023 13:50:25 +0000 Subject: [PATCH 02/22] fix pattern rewrite bug --- .../framework/new_executor/interpretercore.cc | 12 +++-- .../framework/new_executor/interpretercore.h | 3 ++ .../new_executor/new_ir_interpreter.cc | 50 ++++++++++++------- .../new_executor/new_ir_interpreter.h | 3 ++ .../new_executor/standalone_executor.cc | 30 ++++++++--- .../ir/transforms/constant_folding_pass.cc | 22 +++++++- 6 files changed, 89 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 837814b5f9dee6..04e1457f33dcbf 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -47,13 +47,15 @@ InterpreterCore::InterpreterCore(const platform::Place& place, place, block, scope, execution_config); } -InterpreterCore::InterpreterCore(const platform::Place& place, - std::unique_ptr<::ir::Program> ir_prog, - framework::Scope* scope, - const ExecutionConfig& execution_config) { +InterpreterCore::InterpreterCore( + const platform::Place& place, + const std::vector& fetch_var_names, + std::unique_ptr<::ir::Program> ir_prog, + framework::Scope* scope, + const ExecutionConfig& execution_config) { VLOG(4) << "InterpreterCore(): " << this << " on " << place; impl_ = std::make_unique( - place, std::move(ir_prog), scope, execution_config); + place, 
fetch_var_names, std::move(ir_prog), scope, execution_config); } InterpreterCore::~InterpreterCore() { diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index b9c633272e677a..66f998bb557f6e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -37,6 +37,7 @@ class InterpreterCore { const ExecutionConfig& execution_config = ExecutionConfig()); // This constructor is for New IR. InterpreterCore(const platform::Place& place, + const std::vector& fetch_var_names, std::unique_ptr<::ir::Program> ir_prog, Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); @@ -80,6 +81,8 @@ class InterpreterCore { DISABLE_COPY_AND_ASSIGN(InterpreterCore); std::unique_ptr impl_; + + std::vector fetch_var_names_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 3cdc815a562ae4..da7a959c4a8aaf 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -42,16 +42,19 @@ namespace paddle { namespace framework { -NewIRInterpreter::NewIRInterpreter(const platform::Place& place, - std::unique_ptr<::ir::Program> ir_prog, - framework::Scope* scope, - const ExecutionConfig& execution_config) +NewIRInterpreter::NewIRInterpreter( + const platform::Place& place, + const std::vector& fetch_var_names, + std::unique_ptr<::ir::Program> ir_prog, + framework::Scope* scope, + const ExecutionConfig& execution_config) : place_(place), stream_analyzer_(place), execution_config_(execution_config), var_scope_(scope), scope_(scope), - ir_program_(std::move(ir_prog)) { + ir_program_(std::move(ir_prog)), + fetch_var_names_(fetch_var_names) { VLOG(4) << "NewIRInterpreter(): " << this << " on " << place_; static_build_ = FLAGS_new_executor_static_build && 
!FLAGS_new_executor_use_cuda_graph && @@ -228,20 +231,33 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, // return Fetch Tensors Scope* inner_scope = InnerScope(); - auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); - if (fetch_var && need_fetch) { - auto fetch_list = std::move(*fetch_var->GetMutable()); -#ifdef PADDLE_WITH_CUDA - if (platform::IsCUDAGraphCapturing()) { - PADDLE_ENFORCE_EQ(fetch_list.empty(), - true, - platform::errors::InvalidArgument( - "Cannot fetch data when using CUDA Graph.")); + if (FLAGS_enable_new_ir_in_executor) { + framework::FetchList fetch_res; + + if (need_fetch) { + for (auto& var_name : fetch_var_names_) { + auto* var = inner_scope->FindVar(var_name); + fetch_res.push_back(var->Get()); + } } -#endif - return fetch_list; + return fetch_res; } else { - return {}; + auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName); + if (fetch_var && need_fetch) { + auto fetch_list = + std::move(*fetch_var->GetMutable()); +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_list.empty(), + true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + } +#endif + return fetch_list; + } else { + return {}; + } } } diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index 14c8d1778c288e..744a130a1aa048 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -34,6 +34,7 @@ class NewIRInterpreter : public InterpreterBaseImpl { public: NewIRInterpreter(const platform::Place& place, + const std::vector& fetch_var_names, std::unique_ptr<::ir::Program> ir_prog, Scope* scope, const ExecutionConfig& execution_config = ExecutionConfig()); @@ -217,6 +218,8 @@ class NewIRInterpreter : public InterpreterBaseImpl { std::vector variable_list_; interpreter::IrDependencyBuilder 
ir_dependency_builder_; + + std::vector fetch_var_names_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 5d2845f9ca6f32..0e6292f0b1bf4e 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -69,19 +69,33 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, auto block = base_program->block(); for (auto it = block->begin(); it != block->end(); ++it) { if ((*it)->name() == "pd.fetch") { - fetch_var_names_.push_back((*it) - ->attributes() - .at("name") - .dyn_cast() - .AsString() + - "@fetch"); + size_t index = (*it) + ->attributes() + .at("col") + .dyn_cast() + .data(); + + if (fetch_var_names_.size() < index + 1) { + fetch_var_names_.resize(index + 1); + } + + fetch_var_names_[index] = (*it) + ->attributes() + .at("name") + .dyn_cast() + .AsString() + + "@fetch"; } } auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); - interpretercores_.emplace_back(std::make_shared( - place_, std::move(kernel_program), scope_, execution_config)); + interpretercores_.emplace_back( + std::make_shared(place_, + fetch_var_names_, + std::move(kernel_program), + scope_, + execution_config)); } else { interpretercores_.emplace_back( std::make_shared(place_, diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc index 3fcdee6748b206..5f107af71e519a 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc @@ -71,15 +71,35 @@ class ConstantFoldingPattern : public ir::RewritePattern { ir::Program* program = op->GetParentProgram(); auto temp_program = BuildProgramFromOperation(op); + std::vector fetch_var_names; + auto block = temp_program->block(); + for (auto it = block->begin(); it != block->end(); ++it) { 
+ if ((*it)->name() == "pd.fetch") { + size_t index = + (*it)->attributes().at("col").dyn_cast().data(); + + if (fetch_var_names.size() < index + 1) { + fetch_var_names.resize(index + 1); + } + + fetch_var_names[index] = (*it) + ->attributes() + .at("name") + .dyn_cast() + .AsString() + + "@fetch"; + } + } + // Execute program paddle::framework::interpreter::ExecutionConfig exe_config; exe_config.create_local_scope = false; paddle::framework::InterpreterCore core( phi::CPUPlace{}, + fetch_var_names, paddle::dialect::PdOpLowerToKernelPass(temp_program.get()), &scope_, exe_config); - paddle::framework::FetchList fetch_list = core.Run({}); // TODO(liuyuanle): Support multiple output. From 7e60294e45d5f1f90701d180b61e33949bfd2ae4 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sun, 30 Jul 2023 13:36:35 +0000 Subject: [PATCH 03/22] try to remove constant fold --- test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc index ebb6144753e2aa..da7b46acf8c2ab 100644 --- a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc @@ -1087,7 +1087,7 @@ TEST(pattern_rewrite, Patterns) { ir::PassManager pm(ctx); pm.AddPass(std::make_unique()); - pm.AddPass(ir::CreateConstantFoldingPass()); + // pm.AddPass(ir::CreateConstantFoldingPass()); pm.AddPass(ir::CreateDeadCodeEliminationPass()); pm.EnablePassTiming(); pm.EnableIRPrinting(); From 4b38badb7bc9f777c622d99e0e561a9ce8ff5519 Mon Sep 17 00:00:00 2001 From: phlrain Date: Mon, 31 Jul 2023 09:43:51 +0000 Subject: [PATCH 04/22] revert code --- test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc index da7b46acf8c2ab..ebb6144753e2aa 100644 --- 
a/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc +++ b/test/cpp/ir/pattern_rewrite/pattern_rewrite_test.cc @@ -1087,7 +1087,7 @@ TEST(pattern_rewrite, Patterns) { ir::PassManager pm(ctx); pm.AddPass(std::make_unique()); - // pm.AddPass(ir::CreateConstantFoldingPass()); + pm.AddPass(ir::CreateConstantFoldingPass()); pm.AddPass(ir::CreateDeadCodeEliminationPass()); pm.EnablePassTiming(); pm.EnableIRPrinting(); From 354b1f9adec2240c57e5fb887581fe16e0c47c59 Mon Sep 17 00:00:00 2001 From: phlrain Date: Mon, 31 Jul 2023 14:34:10 +0000 Subject: [PATCH 05/22] add pattern rewrite test flag --- test/cpp/ir/pattern_rewrite/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/cpp/ir/pattern_rewrite/CMakeLists.txt b/test/cpp/ir/pattern_rewrite/CMakeLists.txt index fd527db555003e..2023cc0cf413f3 100644 --- a/test/cpp/ir/pattern_rewrite/CMakeLists.txt +++ b/test/cpp/ir/pattern_rewrite/CMakeLists.txt @@ -7,3 +7,7 @@ endif() cc_test_old(pattern_rewrite_test SRCS pattern_rewrite_test.cc DEPS ${PATTERN_REWRITE_TEST_DEPS}) + +set_tests_properties( + pattern_rewrite_test PROPERTIES ENVIRONMENT + "FLAGS_enable_new_ir_in_executor=true") From edf3ce286ba9e9f0ca036ec7a5dd76c3265862b9 Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 1 Aug 2023 04:04:28 +0000 Subject: [PATCH 06/22] fix multi fetch --- .../framework/new_executor/standalone_executor.cc | 12 ++++++++++++ .../fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 0e6292f0b1bf4e..a89db7b22e18de 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -90,12 +90,24 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); + 
interpretercores_.emplace_back( std::make_shared(place_, fetch_var_names_, std::move(kernel_program), scope_, execution_config)); + + // NOTE(phlrain): Why a prefix is added here: in the eager op tests, + // different test cases share one scope (but not one standalone executor), + // so a prefix is needed to keep them from fetching the same variable. + std::stringstream pre_ss; + pre_ss << interpretercores_.back()->Impl(); + + for (size_t i = 0; i < fetch_var_names_.size(); ++i) { + fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i]; + } + } else { interpretercores_.emplace_back( std::make_shared(place_, diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index d5de1abcf7a2f6..81464874e2a932 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -190,7 +190,8 @@ void HandleForSpecialOp( auto fetch_src_name = op->attributes().at("name").dyn_cast().AsString(); - auto fetch_var_name = fetch_src_name + "@fetch"; + auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + "@fetch"; + std::cerr << "fetch var name " << fetch_var_name << std::endl; auto* var = const_cast(inner_scope->root()) ->Var(fetch_var_name); var->GetMutable(); From c7206c1ee6dfe3ff25d0d9734401df6fb649021b Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 1 Aug 2023 05:36:47 +0000 Subject: [PATCH 07/22] remove usless code --- paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 81464874e2a932..3704f33632f9ff 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -191,7 +191,6 @@ void HandleForSpecialOp( op->attributes().at("name").dyn_cast().AsString(); auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + 
"@fetch"; - std::cerr << "fetch var name " << fetch_var_name << std::endl; auto* var = const_cast(inner_scope->root()) ->Var(fetch_var_name); var->GetMutable(); From b80e3d4393fb15e9607e33599b99e6b6c08cf0f1 Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 1 Aug 2023 06:33:41 +0000 Subject: [PATCH 08/22] new ir support legacy kernel instraction --- .../new_executor/instruction/CMakeLists.txt | 1 + .../instruction/instruction_base.cc | 184 +++++++++++++++++ .../instruction/instruction_base.h | 27 +++ .../instruction/legacy_kernel_instruction.cc | 184 +++++++++++++++++ .../instruction/legacy_kernel_instruction.h | 71 +++++++ .../instruction/phi_kernel_instruction.cc | 186 +----------------- .../instruction/phi_kernel_instruction.h | 8 - .../new_executor/new_ir_interpreter.cc | 34 +++- .../new_executor/standalone_executor.cc | 4 +- .../fused_softmax_mask_upper_triangle_op.cu | 1 + test/legacy_test/eager_op_test.py | 2 + ...est_softmax_mask_fuse_upper_triangle_op.py | 47 ++--- 12 files changed, 528 insertions(+), 221 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc create mode 100644 paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 8d9a93757d3099..88064749eaf027 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library( instruction_base SRCS instruction_base.cc phi_kernel_instruction.cc + legacy_kernel_instruction.cc DEPS phi framework_proto) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 6c09d7aa2a13fd..11f9e4071fe8fc 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ 
b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -16,6 +16,10 @@ #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/ir/core/builtin_attribute.h" + namespace paddle { namespace framework { @@ -93,5 +97,185 @@ void InstructionBase::SetOutputs( output_index_ = outputs; } +std::vector InstructionBase::GetValueIds( + ir::Value value, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) { + std::vector ids; + std::string var_name = value_2_var_name.at(value); + ids.push_back(var_name_2_id.at(var_name)); + // NOTE(zhangbo): Value maybe a VariableRefArray + auto var = inner_scope->FindVar(var_name); + if (var->IsType()) { + auto& var_array = var->Get(); + for (size_t i = 0; i < var_array.size(); ++i) { + ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i]))); + } + } + return ids; +} + +void InstructionBase::InitInputsOutputsIds( + ::ir::Operation* op, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name, + const std::string& op_name) { + std::unordered_map> inputs; + for (size_t i = 0; i < op->num_operands(); i++) { + ir::Value value = op->operand(i); + if (value) { + PADDLE_ENFORCE_NE( + value_2_var_name.find(value), + value_2_var_name.end(), + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + op_name)); + std::vector inputs_id = GetValueIds(value, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name); + inputs.emplace(value, inputs_id); + } + } + SetInputs(inputs); + VLOG(8) << "finish 
process inputs_index"; + std::unordered_map> outputs; + for (size_t i = 0; i < op->num_results(); i++) { + ir::Value value = op->result(i); + if (value) { + PADDLE_ENFORCE_NE( + value_2_var_name.find(value), + value_2_var_name.end(), + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", + i, + op_name)); + std::vector outputs_id = GetValueIds(value, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name); + outputs.emplace(value, outputs_id); + } + } + SetOutputs(outputs); + VLOG(8) << "finish process outputs_index"; +} + +platform::DeviceContext* InstructionBase::ParseDeviceContext( + ir::Operation* op, + platform::DeviceContext* origin_dev_ctx, + const platform::Place& place, + const std::string& execution_stream, + const int stream_priority) { + auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + interpreter::ContextManager& ctx_manager = + interpreter::ContextManager::Instance(); + + platform::DeviceContext* dev_ctx = nullptr; + + // only gpu need update. xpu not need, because xpu memcpy op kernel is + // synchronous. 
+ if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { + VLOG(6) << "Parse DeviceContext for " << op_name + << ", execution stream = " << execution_stream; + if (execution_stream != kDefaultStream) { + dev_ctx = ctx_manager + .Get(std::string(kCustomStream) + "-" + execution_stream, + place, + stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + + if (op_name == interpreter::kMemcpyD2H) { + dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } else if (op_name == interpreter::kMemcpyH2D) { + dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum + // with use_calc_stream==false by returning a device context getting from the + // global NCCLCommContext instance. Because when use_calc_stream==false, in + // OP kernel, the NCCL communication will be launched to the stream directly + // getting from the global NCCLCommContext instance rather than the + // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in + // c_allreduce_op.h). Now it is just a temporary solution for ONLY + // c_allreduce_sum which is used in ResNet50 distributed training. 
+ if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") + .dyn_cast<::ir::BoolAttribute>() + .data() == false) { + int ring_id = + op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); + return platform::NCCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + } +#endif + } + + if (origin_dev_ctx != nullptr) { + interpreter::SetDeviceCommContext(op, origin_dev_ctx); + } + return origin_dev_ctx; +} + +OpFuncType InstructionBase::AnalyseOpFuncType(ir::Operation* op, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + return OpFuncType::kCpuSync; + } + + PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), + true, + phi::errors::Fatal("Unsupported current place %s", place)); + + // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU + // computing. They execute serially in device thread and block CUDA kernel + // launching in other GPU OPs. To improve performance, set them as kGpuSync + // and so that they would be dispatched to host thread. 
+ auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + if (op_name == kCoalesceTensor && + (!platform::is_xpu_place(place) || + op->attribute("persist_output").data() == false) && + op->attribute("set_constant").data() == false && + op->attribute("copy_data").data() == false) { + return OpFuncType::kGpuSync; + } + + // for memcpy explicitly called by user + if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { + return OpFuncType::kGpuSync; + } + + if (op_name == "shape") { + return OpFuncType::kGpuSync; + } + return OpFuncType::kGpuAsync; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 7452990a1d9076..11d1f3e2f3eae8 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -137,7 +137,34 @@ class InstructionBase { virtual const std::string& Name() const = 0; + protected: + OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place); + + platform::DeviceContext* ParseDeviceContext( + ir::Operation* op, + platform::DeviceContext* origin_dev_ctx, + const platform::Place& place, + const std::string& execution_stream, + const int stream_priority); + + void InitInputsOutputsIds( + ::ir::Operation* op, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name, + const std::string& op_name); + private: + std::vector GetValueIds( + ir::Value value, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name); + size_t id_; bool is_artificial_; // Instruction is artificial 
means that it is only used diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc new file mode 100644 index 00000000000000..81690f45f463f1 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -0,0 +1,184 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" + +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/ir/dialect/pd_dialect.h" +#include "paddle/fluid/ir/interface/infermeta.h" +#include "paddle/fluid/ir/interface/op_yaml_info.h" +#include "paddle/fluid/ir/interface/op_yaml_info_parser.h" +#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/type_defs.h" + +#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/ir/core/operation.h" +#include "paddle/ir/core/value.h" + +namespace paddle { +namespace framework { + +LegacyKernelInstruction::LegacyKernelInstruction( + size_t id, + const 
platform::Place& place, + ir::Operation* op, + Scope* scope, + Scope* local_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) + : InstructionBase(id, place) { + auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name); + + legacy_op_name_ = op_name; + VLOG(6) << "construct phi kernel instruction for: " << legacy_op_name_; + + // Todo: support paddle::dialect::DistAttribute + // if (op_attributes.count("dist_attr") != 0) { + // if (op_attributes.count("execution_stream") != 0) { + // SetExecutionStream(op_attributes.at("execution_stream") + // .dyn_cast<::ir::StrAttribute>() + // .data()); + // } + // if (op_attributes.count("stream_priority") != 0) { + // SetStreamPriority(op_attributes.at("stream_priority") + // .dyn_cast<::ir::Int32Attribute>() + // .data()); + // } + // if (op_attributes.count("scheduling_priority") != 0) { + // SetSchedulingPriority(op_attributes.at("scheduling_priority") + // .dyn_cast<::ir::Int64Attribute>() + // .data()); + // } + // } else { + // if (interpreter::IsCommunicationOp(op)) { + // // NOTE(Ruibiao): Dispatching computation before communication + // improves + // // multi-stream overlap when the time cost of communication less than + // // that of the calculation (e.g., ResNet50_bs128_pure_fp16 N4C32 + // // training). 
+ // op_func_node.scheduling_priority_ = 1; + // } + // } + VLOG(6) << "finish process dist attributes"; + + SetKernelType(AnalyseOpFuncType(op, place)); + VLOG(6) << "finish process analyse kernel type"; + + infer_meta_interface_ = + op_info.GetInterfaceImpl(); + VLOG(6) << "finish process infer_meta_interface_"; + + auto yaml_interface = + op_info.GetInterfaceImpl(); + PADDLE_ENFORCE_NOT_NULL( + yaml_interface, + phi::errors::PreconditionNotMet( + "can not find OpYamlInfoInterface from [%s]", legacy_op_name_)); + paddle::dialect::OpYamlInfoParser yaml_info_parser( + yaml_interface->get_op_info_()); + VLOG(6) << "finish process yaml_info_parser"; + + ::ir::BuildPhiContext< + phi::InferMetaContext, + phi::MetaTensor, + phi::MetaTensor, + paddle::small_vector, + paddle::small_vector, + false>(op, + value_2_var_name, + scope, + local_scope, + yaml_info_parser, + &infer_meta_context_); + VLOG(6) << "finish process infer meta context"; + + auto kernel_name = + op_attributes.at("kernel_name").dyn_cast().AsString(); + auto kernel_key = op_attributes.at("kernel_key") + .dyn_cast() + .data(); + auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, kernel_key); + phi_kernel_ = new phi::Kernel(kernel_result.kernel); + PADDLE_ENFORCE_EQ( + phi_kernel_->IsValid(), true, "not found kernel for [%s]", kernel_name); + VLOG(6) << "finish process select kernel"; + + operator_base_ = + ir::BuildOperatorBase(op, value_2_var_name, yaml_info_parser); + paddle::framework::VariableValueMap in_map; + paddle::framework::VariableValueMap out_map; + auto dev_ctx = phi::DeviceContextPool::Instance().Get( + phi::TransToPhiPlace(kernel_key.backend())); + + runtime_context_ = std::make_shared( + paddle::framework::RuntimeContext(in_map, out_map)); + ir::BuildRuntimeContext(op, + value_2_var_name, + scope, + local_scope, + yaml_info_parser, + runtime_context_.get()); + kernel_context_ = std::make_shared( + paddle::framework::ExecutionContext( + 
*operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get()))); + + VLOG(6) << "finish process kernel context"; + SetDeviceContext( + ParseDeviceContext(op, + phi::DeviceContextPool::Instance().Get( + phi::TransToPhiPlace(kernel_key.backend())), + place, + GetExecutionStream(), + GetStreamPriority())); + VLOG(6) << "finish process device context"; + + Scope* inner_scope = local_scope == nullptr ? scope : local_scope; + InitInputsOutputsIds(op, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name, + legacy_op_name_); + VLOG(6) << "finish process inputs outputs index"; + + auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); + std::unordered_set<::ir::Value> no_need_buffer_values; + for (size_t id = 0; id < no_need_buffer_ids.size(); id++) { + no_need_buffer_values.insert(op->operand(no_need_buffer_ids[id])); + } + SetNoNeedBuffer(no_need_buffer_values); + VLOG(6) << "finish process no need buffer"; +} + +void LegacyKernelInstruction::Run() { + infer_meta_interface_->infer_meta_(&(infer_meta_context_)); + VLOG(6) << "Run op " << legacy_op_name_ << " infer meta."; + (*(phi_kernel_))((kernel_context_.get())); + VLOG(6) << "Run op " << legacy_op_name_ << " kernel."; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h new file mode 100644 index 00000000000000..a8a150fbb6c776 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -0,0 +1,71 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" + +namespace ir { +class Operation; +} // namespace ir + +namespace paddle { +namespace framework { +class Scope; +class Value; + +class LegacyKernelInstruction : public InstructionBase { + public: + LegacyKernelInstruction( + size_t id, + const platform::Place& place, + ::ir::Operation* op, + Scope* scope, + Scope* local_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name); + + phi::Kernel* PhiKernel() const { return phi_kernel_; } + + const phi::InferMetaContext& InferMetaContext() const { + return infer_meta_context_; + } + + paddle::dialect::InferMetaInterface::Concept* InferMetaInterface() const { + return infer_meta_interface_; + } + + void Run() override; + + const std::string& Name() const override { return legacy_op_name_; } + + private: + std::string legacy_op_name_; + + paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{ + nullptr}; // not owned + + phi::InferMetaContext infer_meta_context_; + + std::shared_ptr runtime_context_; + std::shared_ptr operator_base_; + std::shared_ptr kernel_context_; + + phi::Kernel* phi_kernel_{nullptr}; // not owned +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index 39e791aca3f8ac..4e73418d5abd6b 100644 --- 
a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -35,112 +35,6 @@ namespace paddle { namespace framework { -platform::DeviceContext* ParseDeviceContext( - ir::Operation* op, - platform::DeviceContext* origin_dev_ctx, - const platform::Place& place, - const std::string& execution_stream, - const int stream_priority) { - auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - interpreter::ContextManager& ctx_manager = - interpreter::ContextManager::Instance(); - - platform::DeviceContext* dev_ctx = nullptr; - - // only gpu need update. xpu not need, because xpu memcpy op kernel is - // synchronous. - if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { - VLOG(6) << "Parse DeviceContext for " << op_name - << ", execution stream = " << execution_stream; - if (execution_stream != kDefaultStream) { - dev_ctx = ctx_manager - .Get(std::string(kCustomStream) + "-" + execution_stream, - place, - stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - - if (op_name == interpreter::kMemcpyD2H) { - dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } else if (op_name == interpreter::kMemcpyH2D) { - dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum - // with use_cal_stream==false by returning a device context getting from the - // global NCCLCommContext instance. 
Because when use_calc_stream==false, in - // OP kernel, the NCCL communication will be launched to the stream directly - // getting from the global NCCLCommContext instance rather than the - // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in - // c_allreduce_op.h). Now it is just a temporary solution for ONLY - // c_allreduce_sum which is used in ResNet50 distributed training. - if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") - .dyn_cast<::ir::BoolAttribute>() - .data() == false) { - int ring_id = - op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); - return platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->dev_context(); - } -#endif - } - - if (origin_dev_ctx != nullptr) { - interpreter::SetDeviceCommContext(op, origin_dev_ctx); - } - return origin_dev_ctx; -} - -OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) { - if (platform::is_cpu_place(place)) { - return OpFuncType::kCpuSync; - } - - PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), - true, - phi::errors::Fatal("Unsupported current place %s", place)); - - // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU - // computing. They execute serially in device thread and block CUDA kernel - // launching in other GPU OPs. To improve performance, set them as kGpuSync - // and so that they would be dispatched to host thread. 
- auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - if (op_name == kCoalesceTensor && - (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { - return OpFuncType::kGpuSync; - } - - // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { - return OpFuncType::kGpuSync; - } - - if (op_name == "shape") { - return OpFuncType::kGpuSync; - } - return OpFuncType::kGpuAsync; -} - PhiKernelInstruction::PhiKernelInstruction( size_t id, const platform::Place& place, @@ -256,8 +150,12 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process device context"; Scope* inner_scope = local_scope == nullptr ? scope : local_scope; - InitInputsOutputsIds( - op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name); + InitInputsOutputsIds(op, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name, + phi_op_name_); VLOG(6) << "finish process inputs outputs index"; auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds(); @@ -269,78 +167,6 @@ PhiKernelInstruction::PhiKernelInstruction( VLOG(6) << "finish process no need buffer"; } -std::vector GetValueIds( - ir::Value value, - Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { - std::vector ids; - std::string var_name = value_2_var_name.at(value); - ids.push_back(var_name_2_id.at(var_name)); - // NOTE(zhangbo): Value maybe a VariableRefArray - auto var = inner_scope->FindVar(var_name); - if (var->IsType()) { - auto& var_array = var->Get(); - for (size_t i = 0; i < var_array.size(); ++i) { - ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i]))); - } - } - return ids; -} - 
-void PhiKernelInstruction::InitInputsOutputsIds( - ::ir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { - std::unordered_map> inputs; - for (size_t i = 0; i < op->num_operands(); i++) { - ir::Value value = op->operand(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - phi_op_name_)); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - inputs.emplace(value, inputs_id); - } - } - SetInputs(inputs); - VLOG(8) << "finish process inputs_index"; - std::unordered_map> outputs; - for (size_t i = 0; i < op->num_results(); i++) { - ir::Value value = op->result(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - phi_op_name_)); - std::vector outputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - outputs.emplace(value, outputs_id); - } - } - SetOutputs(outputs); - VLOG(8) << "finish process outputs_index"; -} - void PhiKernelInstruction::Run() { infer_meta_interface_->infer_meta_(&(infer_meta_context_)); VLOG(6) << "Run op " << phi_op_name_ << " infer meta."; diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index b30fa8bff751b5..fcd35a3b762904 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -55,14 +55,6 @@ class PhiKernelInstruction : public InstructionBase { const std::string& Name() const 
override { return phi_op_name_; } private: - void InitInputsOutputsIds( - ::ir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name); - std::string phi_op_name_; paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{ diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index b9060cb16e0d88..78208f30b8e239 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -36,6 +36,7 @@ #include "paddle/fluid/platform/flags.h" #include "paddle/phi/backends/device_manager.h" +#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" #include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h" #include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" @@ -1645,15 +1646,30 @@ void NewIRInterpreter::BuildInstruction() { VLOG(6) << "skip process " << op_name; continue; } - vec_instruction_base_.emplace_back( - std::make_unique(op_idx++, - place_, - (*it), - scope_, - local_scope_, - value_2_var_name_, - var_name_2_id_, - variable_2_var_name_)); + + if (op_name == "pd.fused_softmax_mask_upper_triangle" || + op_name == "pd.fused_softmax_mask_upper_triangle_grad") { + std::cerr << "emplace lagcy kernel " << op_name << std::endl; + vec_instruction_base_.emplace_back( + std::make_unique(op_idx++, + place_, + (*it), + scope_, + local_scope_, + value_2_var_name_, + var_name_2_id_, + variable_2_var_name_)); + } else { + vec_instruction_base_.emplace_back( + std::make_unique(op_idx++, + place_, + (*it), + scope_, + local_scope_, + value_2_var_name_, + var_name_2_id_, + variable_2_var_name_)); + } } else { PADDLE_THROW(platform::errors::Unimplemented( "Now only support pd_kernel dialect.")); diff --git 
a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index c80f9b36ff98ba..1f15aa6e12f648 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -65,8 +65,10 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, if (FLAGS_enable_new_ir_in_executor) { VLOG(6) << "begin to translate" << std::endl; auto base_program = paddle::TranslateLegacyProgramToProgram(*program); + base_program->Print(std::cout); auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); + kernel_program->Print(std::cout); interpretercores_.emplace_back(std::make_shared( place_, std::move(kernel_program), scope_, execution_config)); } else { @@ -126,7 +128,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( interpretercores_[job_idx]->ShareBuildResultsFrom( interpretercores_[type_to_first_id[job_type]]); } - interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false); + interpretercores_[job_idx]->BetaRun(feed_names, /*need_fetch = */ false); } // return Fetch Tensors diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 779ee234071af0..05a82db2826385 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -475,6 +475,7 @@ template class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + std::cerr << "comute grad " << std::endl; auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py index f34ff8e137da6f..175cc7346f39f9 100644 --- a/test/legacy_test/eager_op_test.py +++ 
b/test/legacy_test/eager_op_test.py @@ -2682,6 +2682,7 @@ def check_grad_with_place( max_relative_error = ( 0.001 if max_relative_error < 0.001 else max_relative_error ) + print("grad", analytic_grads) self._assert_is_close( numeric_grads, analytic_grads, @@ -3039,6 +3040,7 @@ def _get_gradient( compiled_prog = fluid.CompiledProgram(prog, build_strategy) prog = compiled_prog executor = fluid.Executor(place) + print(prog) res = list( map( np.array, diff --git a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py index 82dbaaf0e78c46..217a96281e02c2 100644 --- a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py +++ b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py @@ -18,7 +18,6 @@ from eager_op_test import OpTest import paddle -from paddle import fluid, incubate from paddle.fluid import core paddle.enable_static() @@ -48,37 +47,38 @@ def setUp(self): rst = _get_softmax_upper(x) self.outputs = {'Out': rst} - def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0)) + # def test_check_output(self): + # self.check_output_with_place(core.CUDAPlace(0)) def test_check_grad(self): self.check_grad_with_place(core.CUDAPlace(0), ["X"], "Out") -@unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -) -class TestSoftmaxMaskFuseOp1(OpTest): - def setUp(self): - self.op_type = "fused_softmax_mask_upper_triangle" - x = np.random.random((1, 4, 32, 32)) - self.inputs = {'X': x} - rst = _get_softmax_upper(x) - self.outputs = {'Out': rst} +# @unittest.skipIf( +# not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +# ) +# class TestSoftmaxMaskFuseOp1(OpTest): +# def setUp(self): +# self.op_type = "fused_softmax_mask_upper_triangle" +# x = np.random.random((1, 4, 32, 32)) +# self.inputs = {'X': x} +# rst = _get_softmax_upper(x) +# self.outputs = {'Out': rst} - def test_check_output(self): - try: - 
self.check_output_with_place(core.CPUPlace()) - except (NotImplementedError, RuntimeError): - pass +# def test_check_output(self): +# try: +# self.check_output_with_place(core.CPUPlace()) +# except (NotImplementedError, RuntimeError): +# pass - def test_check_grad(self): - try: - self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") - except (NotImplementedError, RuntimeError): - pass +# def test_check_grad(self): +# try: +# self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") +# except (NotImplementedError, RuntimeError): +# pass +''' @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) @@ -117,6 +117,7 @@ def test_dygraph(self): rst = incubate.softmax_mask_fuse_upper_triangle(input_x) np.testing.assert_allclose(rst, rst_np, rtol=1e-05) +''' if __name__ == '__main__': unittest.main() From 755eace533310f942a808c5fedf5fad7c5a4c41b Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 1 Aug 2023 09:27:55 +0000 Subject: [PATCH 09/22] new ir support legacy kernel instruction --- .../new_executor/instruction/instruction_base.h | 9 ++++++--- .../fluid/framework/new_executor/new_ir_interpreter.cc | 5 +++++ .../operators/fused_softmax_mask_upper_triangle_op.cu | 2 ++ paddle/phi/kernels/impl/fetch_impl.h | 2 ++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 11d1f3e2f3eae8..0af596ea50a694 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -21,6 +21,8 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/event.h" +#include "paddle/ir/core/operation.h" +#include "paddle/ir/core/value.h" namespace ir { class Value; @@ -138,10 +140,11 @@ class InstructionBase { virtual const std::string& Name() const = 0; protected: - OpFuncType 
AnalyseOpFuncType(ir::Operation* op, const platform::Place& place); + OpFuncType AnalyseOpFuncType(::ir::Operation* op, + const platform::Place& place); platform::DeviceContext* ParseDeviceContext( - ir::Operation* op, + ::ir::Operation* op, platform::DeviceContext* origin_dev_ctx, const platform::Place& place, const std::string& execution_stream, @@ -158,7 +161,7 @@ class InstructionBase { private: std::vector GetValueIds( - ir::Value value, + ::ir::Value value, Scope* inner_scope, const std::unordered_map<::ir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 7e914cd4332212..67c1c4da0b848d 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1875,6 +1875,10 @@ void NewIRInterpreter::CalculateLastLiveOps() { ins.begin(), ins.end()}; ins_and_outs.insert(outs.begin(), outs.end()); + if (instr->Name() != "pd.fetch") { + ins_and_outs.insert(outs.begin(), outs.end()); + } + for (auto& item : ins_and_outs) { for (auto var_id : item.second) { // skip no_need_buffer input vars @@ -2117,6 +2121,7 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(5) << "begin to run op " << instr_node->Name(); if (!instr_node->IsArtificial()) { + std::cerr << "op name " << instr_node->Name() << std::endl; instr_node->Run(); VLOG(4) << "done instruction node run"; CheckGC(instr_node); diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 05a82db2826385..c098a11537a07a 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -476,6 +476,7 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { std::cerr << "comute grad " << std::endl; + auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = @@ -486,6 +487,7 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { auto* grad_y_data = grad_y->data(); auto* softmax_rst_data = softmax_rst->data(); + std::cerr << "grad x" << grad_x->dims() << std::endl; auto y_dim = grad_y->dims(); auto batches = y_dim[0]; auto attn_heads = y_dim[1]; diff --git a/paddle/phi/kernels/impl/fetch_impl.h b/paddle/phi/kernels/impl/fetch_impl.h index d90a813e4a16b3..3769f58c424c28 100644 --- a/paddle/phi/kernels/impl/fetch_impl.h +++ b/paddle/phi/kernels/impl/fetch_impl.h @@ -21,7 +21,9 @@ namespace phi { template void FetchKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + std::cerr << "fetch out " << out->dims() << std::endl; phi::Copy(ctx, x, phi::CPUPlace(), true, out); + std::cerr << "fetch out " << out->data()[0] << std::endl; } } // namespace phi From adf66ea51ce4a6e888bf6bd08685b7692ddf7ffb Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 1 Aug 2023 10:23:35 +0000 Subject: [PATCH 10/22] add scope prefix --- .../new_executor/interpreter_base_impl.h | 3 +++ .../framework/new_executor/interpretercore.cc | 9 +++++++++ .../framework/new_executor/interpretercore.h | 3 +++ .../framework/new_executor/new_ir_interpreter.cc | 16 +++++++++++++--- .../framework/new_executor/new_ir_interpreter.h | 4 ++++ .../new_executor/program_interpreter.cc | 9 +++++++++ .../framework/new_executor/program_interpreter.h | 3 +++ .../new_executor/standalone_executor.cc | 5 ++--- .../fluid/ir/transforms/constant_folding_pass.cc | 4 +++- 9 files changed, 49 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 586aef975a643c..ab75b56b96def8 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ 
b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -108,6 +108,9 @@ class InterpreterBaseImpl { virtual const interpreter::StreamAnalyzer& GetStreamAnalyzer() const = 0; virtual bool IsSharedResultsBuild() const = 0; + + virtual void SetScopePrefix(const std::string& prefix) = 0; + virtual const std::string& GetScopePrefix() const = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 04e1457f33dcbf..edf2ca6666d46c 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -125,5 +125,14 @@ const platform::Place& InterpreterCore::GetPlace() const { void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { impl_->SetOutputHooks(hookfuncs); } + +void InterpreterCore::SetScopePrefix(const std::string& prefix) { + impl_->SetScopePrefix(prefix); +} + +const std::string& InterpreterCore::GetScopePrefix() const { + return impl_->GetScopePrefix(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 66f998bb557f6e..db26215b2ffc06 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -77,6 +77,9 @@ class InterpreterCore { void SetOutputHooks(const std::vector& hookfuncs); + void SetScopePrefix(const std::string& prefix); + const std::string& GetScopePrefix() const; + private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index ec5b7730b51754..c01392da706985 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -106,6 +106,10 @@ 
NewIRInterpreter::NewIRInterpreter( }; PrepareForCUDAGraphCapture(); + + std::stringstream ss; + ss << this; + scope_prefix_ = ss.str(); } NewIRInterpreter::~NewIRInterpreter() { @@ -200,11 +204,10 @@ FetchList NewIRInterpreter::Run(const std::vector& feed_names, if (!is_build_) { LOG_FIRST_N(INFO, 1) << "New Executor is Running."; - std::stringstream ss; - ss << this; + std::cerr << "scope prefix " << scope_prefix_ << std::endl; ::ir::BuildScope(*ir_program_->block(), InnerScope(), - ss.str(), + scope_prefix_, &value_2_var_name_, &variable_2_var_name_, &var_name_2_id_, @@ -2161,5 +2164,12 @@ void NewIRInterpreter::PreAnalysis() { VLOG(4) << "Done AnalyseExecuteOrderForTrace"; } +void NewIRInterpreter::SetScopePrefix(const std::string& prefix) { + scope_prefix_ = prefix; +} +const std::string& NewIRInterpreter::GetScopePrefix() const { + return scope_prefix_; +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index ef00957e7d8ca1..e62264e1ebd7fc 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -90,6 +90,9 @@ class NewIRInterpreter : public InterpreterBaseImpl { int GetIdByName(const std::string& name) const; + void SetScopePrefix(const std::string& prefix) override; + const std::string& GetScopePrefix() const override; + private: // build graph void Convert(std::vector* op_func_nodes); @@ -254,6 +257,7 @@ class NewIRInterpreter : public InterpreterBaseImpl { interpreter::NewIrStreamAnalyzer ir_stream_analyzer_; std::vector fetch_var_names_; + std::string scope_prefix_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index 08ddd3444fd1c9..38c8ead21d73d5 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ 
b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -1507,5 +1507,14 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() { trace_execute_order_ = trace_order; } +void ProgramInterpreter::SetScopePrefix(const std::string& prefix) { + PADDLE_THROW(phi::errors::Unimplemented( + "Program Interpreter not support SetScopePrefix")); +} +const std::string& ProgramInterpreter::GetScopePrefix() const { + PADDLE_THROW(phi::errors::Unimplemented( + "Program Interpreter not support GetScopePrefix")); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index a942425609c189..890ed9fde6412f 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -85,6 +85,9 @@ class ProgramInterpreter : public InterpreterBaseImpl { hookfuncs_ = hookfuncs; } + void SetScopePrefix(const std::string& prefix) override; + const std::string& GetScopePrefix() const override; + private: // build graph void Convert(std::vector* op_func_nodes); diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index a89db7b22e18de..3ef42ff0727021 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -101,11 +101,10 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, // NOTE(phlrain): why we add prefix here. 
In earger op test, // different test case use same scope (not same standalone executor), // we must add prefix to prevent fetch same variable in different case - std::stringstream pre_ss; - pre_ss << interpretercores_.back()->Impl(); + auto prefix = interpretercores_.back()->Impl()->GetScopePrefix(); for (size_t i = 0; i < fetch_var_names_.size(); ++i) { - fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i]; + fetch_var_names_[i] = prefix + "_" + fetch_var_names_[i]; } } else { diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc index 5f107af71e519a..edba2300ca111d 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc @@ -82,7 +82,8 @@ class ConstantFoldingPattern : public ir::RewritePattern { fetch_var_names.resize(index + 1); } - fetch_var_names[index] = (*it) + fetch_var_names[index] = "ConstantFoldPrefix_" + + (*it) ->attributes() .at("name") .dyn_cast() @@ -100,6 +101,7 @@ class ConstantFoldingPattern : public ir::RewritePattern { paddle::dialect::PdOpLowerToKernelPass(temp_program.get()), &scope_, exe_config); + core.SetScopePrefix("ConstantFoldPrefix"); paddle::framework::FetchList fetch_list = core.Run({}); // TODO(liuyuanle): Support multiple output. 
From e8f64bcd5356c40993914cc84da49bb906cfbcc4 Mon Sep 17 00:00:00 2001 From: phlrain Date: Wed, 2 Aug 2023 02:40:23 +0000 Subject: [PATCH 11/22] update --- .../framework/new_executor/instruction/instruction_base.cc | 1 + paddle/fluid/framework/new_executor/new_ir_interpreter.cc | 4 ++-- paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc | 3 +++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 11f9e4071fe8fc..84da44e2b83f5a 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -151,6 +151,7 @@ void InstructionBase::InitInputsOutputsIds( for (size_t i = 0; i < op->num_results(); i++) { ir::Value value = op->result(i); if (value) { + std::cerr << "value " << value.impl() << std::endl; PADDLE_ENFORCE_NE( value_2_var_name.find(value), value_2_var_name.end(), diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 67c1c4da0b848d..b670fba7fa09f3 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1601,8 +1601,7 @@ void NewIRInterpreter::BuildInstruction() { .dyn_cast<::ir::StrAttribute>() .AsString(); if (op_name == "builtin.combine" || op_name == "builtin.slice" || - op_name == "pd.feed" || op_name == "pd.fetch" || - op_name == "builtin.set_parameter" || + op_name == "pd.feed" || op_name == "builtin.set_parameter" || op_name == "builtin.get_parameter") { VLOG(6) << "skip process " << op_name; continue; @@ -2086,6 +2085,7 @@ void NewIRInterpreter::TraceInstructionList( for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) { auto instr_id = trace_execute_order_[idx]; + std::cerr << "instr id " << instr_id << std::endl; InstructionBase* 
instr_node = vec_instruction_base_.at(instr_id).get(); VLOG(6) << "Run InstructionBase " << instr_id; diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 5dd27a04ad7cf0..0ba72c7ddcd701 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -80,6 +80,9 @@ paddle::framework::Variable* CreateVar( VLOG(6) << "Create var: " << name << " in scope " << inner_scope; var = inner_scope->Var(name); } + + std::cerr << "creater variable for value " << std::endl; + std::cerr << value.impl() << "\t" << name << std::endl; value_2_var_name->emplace(value, name); variable_2_var_name->emplace(var, name); auto id = var_name_2_id->size(); From e97beede65081988ddbc127b1378192f68e95ef1 Mon Sep 17 00:00:00 2001 From: phlrain Date: Wed, 2 Aug 2023 07:26:25 +0000 Subject: [PATCH 12/22] update --- .../instruction/instruction_base.cc | 1 - .../instruction/instruction_base.h | 1 + .../new_executor/new_ir_interpreter.cc | 3 -- paddle/phi/kernels/impl/fetch_impl.h | 2 - test/legacy_test/eager_op_test.py | 1 - ...est_softmax_mask_fuse_upper_triangle_op.py | 47 +++++++++---------- 6 files changed, 24 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 612d5dfb85a72a..5de5d25e6efd29 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -150,7 +150,6 @@ void InstructionBase::InitInputsOutputsIds( for (size_t i = 0; i < op->num_results(); i++) { ir::Value value = op->result(i); if (value) { - std::cerr << "check value " << value.impl() << std::endl; PADDLE_ENFORCE_NE( value_2_var_name.find(value), value_2_var_name.end(), diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h 
b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index eaa76e2a338db6..870b842241253f 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -203,6 +203,7 @@ class InstructionBase { std::unordered_set<::ir::Value> no_need_buffer_values_; + protected: std::string phi_op_name_; }; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 02cd5ea40a20da..3916ffec2fa33d 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1891,7 +1891,6 @@ void NewIRInterpreter::CalculateLastLiveOps() { instr->Outputs(); std::unordered_multimap<::ir::Value, std::vector> ins_and_outs{ ins.begin(), ins.end()}; - ins_and_outs.insert(outs.begin(), outs.end()); if (instr->Name() != "pd.fetch") { ins_and_outs.insert(outs.begin(), outs.end()); @@ -2104,7 +2103,6 @@ void NewIRInterpreter::TraceInstructionList( for (size_t idx = 0; idx < trace_execute_order_.size(); idx++) { auto instr_id = trace_execute_order_[idx]; - std::cerr << "instr id " << instr_id << std::endl; InstructionBase* instr_node = vec_instruction_base_.at(instr_id).get(); VLOG(6) << "Run InstructionBase " << instr_id; @@ -2140,7 +2138,6 @@ void NewIRInterpreter::RunInstructionBase(InstructionBase* instr_node) { VLOG(5) << "begin to run op " << instr_node->Name(); if (!instr_node->IsArtificial()) { - std::cerr << "op name " << instr_node->Name() << std::endl; instr_node->Run(); VLOG(4) << "done instruction node run"; CheckGC(instr_node); diff --git a/paddle/phi/kernels/impl/fetch_impl.h b/paddle/phi/kernels/impl/fetch_impl.h index 3769f58c424c28..d90a813e4a16b3 100644 --- a/paddle/phi/kernels/impl/fetch_impl.h +++ b/paddle/phi/kernels/impl/fetch_impl.h @@ -21,9 +21,7 @@ namespace phi { template void FetchKernel(const Context& ctx, const 
DenseTensor& x, DenseTensor* out) { - std::cerr << "fetch out " << out->dims() << std::endl; phi::Copy(ctx, x, phi::CPUPlace(), true, out); - std::cerr << "fetch out " << out->data()[0] << std::endl; } } // namespace phi diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py index e4830facaa3b1a..485416a3f04af3 100644 --- a/test/legacy_test/eager_op_test.py +++ b/test/legacy_test/eager_op_test.py @@ -2681,7 +2681,6 @@ def check_grad_with_place( max_relative_error = ( 0.001 if max_relative_error < 0.001 else max_relative_error ) - print("grad", analytic_grads) self._assert_is_close( numeric_grads, analytic_grads, diff --git a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py index 217a96281e02c2..82dbaaf0e78c46 100644 --- a/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py +++ b/test/legacy_test/test_softmax_mask_fuse_upper_triangle_op.py @@ -18,6 +18,7 @@ from eager_op_test import OpTest import paddle +from paddle import fluid, incubate from paddle.fluid import core paddle.enable_static() @@ -47,38 +48,37 @@ def setUp(self): rst = _get_softmax_upper(x) self.outputs = {'Out': rst} - # def test_check_output(self): - # self.check_output_with_place(core.CUDAPlace(0)) + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) def test_check_grad(self): self.check_grad_with_place(core.CUDAPlace(0), ["X"], "Out") -# @unittest.skipIf( -# not core.is_compiled_with_cuda(), "core is not compiled with CUDA" -# ) -# class TestSoftmaxMaskFuseOp1(OpTest): -# def setUp(self): -# self.op_type = "fused_softmax_mask_upper_triangle" -# x = np.random.random((1, 4, 32, 32)) -# self.inputs = {'X': x} -# rst = _get_softmax_upper(x) -# self.outputs = {'Out': rst} +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) +class TestSoftmaxMaskFuseOp1(OpTest): + def setUp(self): + self.op_type = 
"fused_softmax_mask_upper_triangle" + x = np.random.random((1, 4, 32, 32)) + self.inputs = {'X': x} + rst = _get_softmax_upper(x) + self.outputs = {'Out': rst} -# def test_check_output(self): -# try: -# self.check_output_with_place(core.CPUPlace()) -# except (NotImplementedError, RuntimeError): -# pass + def test_check_output(self): + try: + self.check_output_with_place(core.CPUPlace()) + except (NotImplementedError, RuntimeError): + pass -# def test_check_grad(self): -# try: -# self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") -# except (NotImplementedError, RuntimeError): -# pass + def test_check_grad(self): + try: + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") + except (NotImplementedError, RuntimeError): + pass -''' @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) @@ -117,7 +117,6 @@ def test_dygraph(self): rst = incubate.softmax_mask_fuse_upper_triangle(input_x) np.testing.assert_allclose(rst, rst_np, rtol=1e-05) -''' if __name__ == '__main__': unittest.main() From 3eeec33bdb778fbbe0fa931c77d67fe6db957397 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 5 Aug 2023 11:23:30 +0000 Subject: [PATCH 13/22] update --- .../instruction/instruction_base.cc | 130 +----------------- .../instruction/instruction_base.h | 8 ++ .../instruction/instruction_util.h | 61 ++------ .../instruction/legacy_kernel_instruction.cc | 5 +- .../instruction/phi_kernel_instruction.cc | 1 + .../new_executor/interpreter_base_impl.h | 3 - .../framework/new_executor/interpretercore.cc | 8 -- .../framework/new_executor/interpretercore.h | 3 - .../new_executor/new_ir_interpreter.cc | 8 +- .../new_executor/new_ir_interpreter.h | 3 - .../new_executor/program_interpreter.cc | 9 -- .../new_executor/program_interpreter.h | 3 - .../ir/transforms/constant_folding_pass.cc | 1 - 13 files changed, 23 insertions(+), 220 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc 
b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 5de5d25e6efd29..5fd12551ff176c 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/framework/new_executor/instruction/instruction_base.h" + +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -97,27 +99,6 @@ void InstructionBase::SetOutputs( output_index_ = outputs; } -std::vector InstructionBase::GetValueIds( - ir::Value value, - Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { - std::vector ids; - std::string var_name = value_2_var_name.at(value); - ids.push_back(var_name_2_id.at(var_name)); - // NOTE(zhangbo): Value maybe a VariableRefArray - auto var = inner_scope->FindVar(var_name); - if (var->IsType()) { - auto& var_array = var->Get(); - for (size_t i = 0; i < var_array.size(); ++i) { - ids.push_back(var_name_2_id.at(variable_2_var_name.at(var_array[i]))); - } - } - return ids; -} - void InstructionBase::InitInputsOutputsIds( ::ir::Operation* op, Scope* inner_scope, @@ -169,112 +150,5 @@ void InstructionBase::InitInputsOutputsIds( VLOG(8) << "finish process outputs_index"; } -platform::DeviceContext* InstructionBase::ParseDeviceContext( - ir::Operation* op, - platform::DeviceContext* origin_dev_ctx, - const platform::Place& place, - const std::string& execution_stream, - const int stream_priority) { - auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - interpreter::ContextManager& ctx_manager = - interpreter::ContextManager::Instance(); - - 
platform::DeviceContext* dev_ctx = nullptr; - - // only gpu need update. xpu not need, because xpu memcpy op kernel is - // synchronous. - if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { - VLOG(6) << "Parse DeviceContext for " << op_name - << ", execution stream = " << execution_stream; - if (execution_stream != kDefaultStream) { - dev_ctx = ctx_manager - .Get(std::string(kCustomStream) + "-" + execution_stream, - place, - stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - - if (op_name == interpreter::kMemcpyD2H) { - dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } else if (op_name == interpreter::kMemcpyH2D) { - dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum - // with use_cal_stream==false by returning a device context getting from the - // global NCCLCommContext instance. Because when use_calc_stream==false, in - // OP kernel, the NCCL communication will be launched to the stream directly - // getting from the global NCCLCommContext instance rather than the - // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in - // c_allreduce_op.h). Now it is just a temporary solution for ONLY - // c_allreduce_sum which is used in ResNet50 distributed training. 
- if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") - .dyn_cast<::ir::BoolAttribute>() - .data() == false) { - int ring_id = - op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); - return platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->dev_context(); - } -#endif - } - - if (origin_dev_ctx != nullptr) { - interpreter::SetDeviceCommContext(op, origin_dev_ctx); - } - return origin_dev_ctx; -} - -OpFuncType InstructionBase::AnalyseOpFuncType(ir::Operation* op, - const platform::Place& place) { - if (platform::is_cpu_place(place)) { - return OpFuncType::kCpuSync; - } - - PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), - true, - phi::errors::Fatal("Unsupported current place %s", place)); - - // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU - // computing. They execute serially in device thread and block CUDA kernel - // launching in other GPU OPs. To improve performance, set them as kGpuSync - // and so that they would be dispatched to host thread. 
- auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - if (op_name == kCoalesceTensor && - (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { - return OpFuncType::kGpuSync; - } - - // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { - return OpFuncType::kGpuSync; - } - - if (op_name == "shape") { - return OpFuncType::kGpuSync; - } - return OpFuncType::kGpuAsync; -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index cf98694ba7dad3..5ce2358a7df799 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -139,6 +139,14 @@ class InstructionBase { virtual const std::string& Name() const = 0; + void InitInputsOutputsIds( + ::ir::Operation* op, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name); + protected: size_t id_; diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index 1905fcd66c9d8a..b1a431fe20f25e 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -20,10 +20,17 @@ #include #include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/event.h" +#include "paddle/ir/core/builtin_attribute.h" #include "paddle/ir/core/operation.h" 
#include "paddle/ir/core/value.h" +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" +#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/platform/collective_helper.h" + namespace paddle { namespace framework { @@ -48,57 +55,6 @@ std::vector GetValueIds( return ids; } -void PhiKernelInstruction::InitInputsOutputsIds( - ::ir::Operation* op, - Scope* inner_scope, - const std::unordered_map<::ir::Value, std::string>& value_2_var_name, - const std::map& var_name_2_id, - const std::unordered_map& - variable_2_var_name) { - std::unordered_map> inputs; - for (size_t i = 0; i < op->num_operands(); i++) { - ir::Value value = op->operand_source(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - phi_op_name_)); - std::vector inputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - inputs.emplace(value, inputs_id); - } - } - SetInputs(inputs); - VLOG(8) << "finish process inputs_index"; - std::unordered_map> outputs; - for (size_t i = 0; i < op->num_results(); i++) { - ir::Value value = op->result(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - phi_op_name_)); - std::vector outputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - outputs.emplace(value, outputs_id); - } - } - SetOutputs(outputs); - VLOG(8) << "finish process outputs_index"; -} - platform::DeviceContext* ParseDeviceContext( ir::Operation* op, platform::DeviceContext* origin_dev_ctx, @@ -170,7 +126,8 @@ platform::DeviceContext* ParseDeviceContext( return origin_dev_ctx; } 
-OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) { +OpFuncType AnalyseOpFuncType(::ir::Operation* op, + const platform::Place& place) { if (platform::is_cpu_place(place)) { return OpFuncType::kCpuSync; } diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index b3a2b6236e3ee1..b3d013443055d7 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h" +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" #include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" #include "paddle/fluid/framework/scope.h" @@ -28,10 +29,6 @@ #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/core/type_defs.h" -#include "paddle/ir/core/builtin_attribute.h" -#include "paddle/ir/core/operation.h" -#include "paddle/ir/core/value.h" - namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc index db6499aeaa3538..d5b7b5affc5d4b 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.cc @@ -32,6 +32,7 @@ #include "paddle/ir/core/operation.h" #include "paddle/ir/core/value.h" +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/interpreter_base_impl.h b/paddle/fluid/framework/new_executor/interpreter_base_impl.h index 
ab75b56b96def8..586aef975a643c 100644 --- a/paddle/fluid/framework/new_executor/interpreter_base_impl.h +++ b/paddle/fluid/framework/new_executor/interpreter_base_impl.h @@ -108,9 +108,6 @@ class InterpreterBaseImpl { virtual const interpreter::StreamAnalyzer& GetStreamAnalyzer() const = 0; virtual bool IsSharedResultsBuild() const = 0; - - virtual void SetScopePrefix(const std::string& prefix) = 0; - virtual const std::string& GetScopePrefix() const = 0; }; inline void SetDeviceId(const platform::Place& place) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index edf2ca6666d46c..9ee34fcc39c115 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -126,13 +126,5 @@ void InterpreterCore::SetOutputHooks(const std::vector& hookfuncs) { impl_->SetOutputHooks(hookfuncs); } -void InterpreterCore::SetScopePrefix(const std::string& prefix) { - impl_->SetScopePrefix(prefix); -} - -const std::string& InterpreterCore::GetScopePrefix() const { - return impl_->GetScopePrefix(); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index db26215b2ffc06..66f998bb557f6e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -77,9 +77,6 @@ class InterpreterCore { void SetOutputHooks(const std::vector& hookfuncs); - void SetScopePrefix(const std::string& prefix); - const std::string& GetScopePrefix() const; - private: DISABLE_COPY_AND_ASSIGN(InterpreterCore); diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index 748cd65fceb250..002b9345ecb584 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ 
b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -113,10 +113,6 @@ NewIRInterpreter::NewIRInterpreter( }; PrepareForCUDAGraphCapture(); - - std::stringstream ss; - ss << this; - scope_prefix_ = ss.str(); } NewIRInterpreter::~NewIRInterpreter() { @@ -1633,7 +1629,7 @@ void NewIRInterpreter::BuildInstruction() { vec_instruction_base_.emplace_back( std::make_unique(op_idx++, place_, - (*it), + op, scope_, local_scope_, value_2_var_name_, @@ -1643,7 +1639,7 @@ void NewIRInterpreter::BuildInstruction() { vec_instruction_base_.emplace_back( std::make_unique(op_idx++, place_, - (*it), + op, scope_, local_scope_, value_2_var_name_, diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.h b/paddle/fluid/framework/new_executor/new_ir_interpreter.h index ce46c986b30761..1388a0276e4655 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.h +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.h @@ -90,9 +90,6 @@ class NewIRInterpreter : public InterpreterBaseImpl { int GetIdByName(const std::string& name) const; - void SetScopePrefix(const std::string& prefix) override; - const std::string& GetScopePrefix() const override; - private: // build graph void Convert(std::vector* op_func_nodes); diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc index c2ed5e7a42889b..9156c46dc6dc27 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.cc +++ b/paddle/fluid/framework/new_executor/program_interpreter.cc @@ -1505,14 +1505,5 @@ void ProgramInterpreter::AnalyseExecuteOrderForTrace() { trace_execute_order_ = trace_order; } -void ProgramInterpreter::SetScopePrefix(const std::string& prefix) { - PADDLE_THROW(phi::errors::Unimplemented( - "Program Interpreter not support SetScopePrefix")); -} -const std::string& ProgramInterpreter::GetScopePrefix() const { - PADDLE_THROW(phi::errors::Unimplemented( - "Program Interpreter not support 
GetScopePrefix")); -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h index 890ed9fde6412f..a942425609c189 100644 --- a/paddle/fluid/framework/new_executor/program_interpreter.h +++ b/paddle/fluid/framework/new_executor/program_interpreter.h @@ -85,9 +85,6 @@ class ProgramInterpreter : public InterpreterBaseImpl { hookfuncs_ = hookfuncs; } - void SetScopePrefix(const std::string& prefix) override; - const std::string& GetScopePrefix() const override; - private: // build graph void Convert(std::vector* op_func_nodes); diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc index eb6e19ac95f0db..0465a189d6f2e2 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc @@ -100,7 +100,6 @@ class ConstantFoldingPattern : public ir::RewritePattern { paddle::dialect::PdOpLowerToKernelPass(temp_program.get()), &scope_, exe_config); - core.SetScopePrefix("ConstantFoldPrefix"); paddle::framework::FetchList fetch_list = core.Run({}); // TODO(liuyuanle): Support multiple output. 
From cdf32c7b276cc80ca7133e0ecbe0f8298c6b9453 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sat, 5 Aug 2023 14:59:41 +0000 Subject: [PATCH 14/22] update --- .../new_executor/instruction/CMakeLists.txt | 2 +- .../instruction/instruction_util.cc | 175 ++++++++++++++++++ .../instruction/instruction_util.h | 133 +------------ .../new_executor/new_ir_interpreter.cc | 2 +- .../new_executor/standalone_executor.cc | 10 + .../ir/transforms/pd_op_to_kernel_pass.cc | 9 +- 6 files changed, 197 insertions(+), 134 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/instruction/instruction_util.cc diff --git a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt index 88064749eaf027..93356e6b217a0f 100644 --- a/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/instruction/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library( instruction_base SRCS instruction_base.cc phi_kernel_instruction.cc - legacy_kernel_instruction.cc + legacy_kernel_instruction.cc instruction_util.cc DEPS phi framework_proto) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.cc b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc new file mode 100644 index 00000000000000..d8ddc30633be07 --- /dev/null +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.cc @@ -0,0 +1,175 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include + +#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h" + +#include "paddle/fluid/framework/new_executor/new_executor_defs.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/event.h" +#include "paddle/ir/core/builtin_attribute.h" +#include "paddle/ir/core/operation.h" +#include "paddle/ir/core/value.h" + +#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" +#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" +#include "paddle/fluid/platform/collective_helper.h" + +namespace paddle { +namespace framework { + +std::vector GetValueIds( + ir::Value value, + Scope* inner_scope, + const std::unordered_map<::ir::Value, std::string>& value_2_var_name, + const std::map& var_name_2_id, + const std::unordered_map& + variable_2_var_name) { + std::vector ids; + std::string var_name = value_2_var_name.at(value); + ids.push_back(var_name_2_id.at(var_name)); + // NOTE(zhangbo): Value maybe a VariableRefArray + auto var = inner_scope->FindVar(var_name); + if (var->IsType()) { + auto& var_array = var->Get(); + for (auto item : var_array) { + ids.push_back(var_name_2_id.at(variable_2_var_name.at(item))); + } + } + return ids; +} + +platform::DeviceContext* ParseDeviceContext( + ir::Operation* op, + platform::DeviceContext* origin_dev_ctx, + const platform::Place& place, + const std::string& execution_stream, + const int stream_priority) { + auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + interpreter::ContextManager& ctx_manager = + interpreter::ContextManager::Instance(); + + platform::DeviceContext* dev_ctx = nullptr; + + // only gpu need update. 
xpu not need, because xpu memcpy op kernel is + // synchronous. + if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { + VLOG(6) << "Parse DeviceContext for " << op_name + << ", execution stream = " << execution_stream; + if (execution_stream != kDefaultStream) { + dev_ctx = ctx_manager + .Get(std::string(kCustomStream) + "-" + execution_stream, + place, + stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + + if (op_name == interpreter::kMemcpyD2H) { + dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } else if (op_name == interpreter::kMemcpyH2D) { + dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) + .get() + .get(); + interpreter::SetDeviceCommContext(op, dev_ctx); + return dev_ctx; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum + // with use_cal_stream==false by returning a device context getting from the + // global NCCLCommContext instance. Because when use_calc_stream==false, in + // OP kernel, the NCCL communication will be launched to the stream directly + // getting from the global NCCLCommContext instance rather than the + // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in + // c_allreduce_op.h). Now it is just a temporary solution for ONLY + // c_allreduce_sum which is used in ResNet50 distributed training. 
+ if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") + .dyn_cast<::ir::BoolAttribute>() + .data() == false) { + int ring_id = + op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); + return platform::NCCLCommContext::Instance() + .Get(ring_id, place) + ->dev_context(); + } +#endif + } + + if (origin_dev_ctx != nullptr) { + interpreter::SetDeviceCommContext(op, origin_dev_ctx); + } + return origin_dev_ctx; +} + +OpFuncType AnalyseOpFuncType(::ir::Operation* op, + const platform::Place& place) { + if (platform::is_cpu_place(place)) { + return OpFuncType::kCpuSync; + } + + auto kernel_key = op->attributes() + .at("kernel_key") + .dyn_cast() + .data(); + if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == + phi::AllocationType::CPU) { + return OpFuncType::kCpuSync; + } + + PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), + true, + phi::errors::Fatal("Unsupported current place %s", place)); + + // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU + // computing. They execute serially in device thread and block CUDA kernel + // launching in other GPU OPs. To improve performance, set them as kGpuSync + // and so that they would be dispatched to host thread. 
+ auto op_attributes = op->attributes(); + auto op_name = + op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); + if (op_name == kCoalesceTensor && + (!platform::is_xpu_place(place) || + op->attribute("persist_output").data() == false) && + op->attribute("set_constant").data() == false && + op->attribute("copy_data").data() == false) { + return OpFuncType::kGpuSync; + } + + // for memcpy explicitly called by user + if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { + return OpFuncType::kGpuSync; + } + + if (op_name == "shape") { + return OpFuncType::kGpuSync; + } + return OpFuncType::kGpuAsync; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index b1a431fe20f25e..3d0aa3df9de963 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -25,12 +25,6 @@ #include "paddle/ir/core/builtin_attribute.h" #include "paddle/ir/core/operation.h" #include "paddle/ir/core/value.h" - -#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" -#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h" -#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h" -#include "paddle/fluid/platform/collective_helper.h" - namespace paddle { namespace framework { @@ -40,136 +34,15 @@ std::vector GetValueIds( const std::unordered_map<::ir::Value, std::string>& value_2_var_name, const std::map& var_name_2_id, const std::unordered_map& - variable_2_var_name) { - std::vector ids; - std::string var_name = value_2_var_name.at(value); - ids.push_back(var_name_2_id.at(var_name)); - // NOTE(zhangbo): Value maybe a VariableRefArray - auto var = inner_scope->FindVar(var_name); - if (var->IsType()) { - auto& var_array = var->Get(); - for (auto item : var_array) { - 
ids.push_back(var_name_2_id.at(variable_2_var_name.at(item))); - } - } - return ids; -} + variable_2_var_name); platform::DeviceContext* ParseDeviceContext( ir::Operation* op, platform::DeviceContext* origin_dev_ctx, const platform::Place& place, const std::string& execution_stream, - const int stream_priority) { - auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - interpreter::ContextManager& ctx_manager = - interpreter::ContextManager::Instance(); - - platform::DeviceContext* dev_ctx = nullptr; - - // only gpu need update. xpu not need, because xpu memcpy op kernel is - // synchronous. - if (platform::is_gpu_place(place) || platform::is_custom_place(place)) { - VLOG(6) << "Parse DeviceContext for " << op_name - << ", execution stream = " << execution_stream; - if (execution_stream != kDefaultStream) { - dev_ctx = ctx_manager - .Get(std::string(kCustomStream) + "-" + execution_stream, - place, - stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - - if (op_name == interpreter::kMemcpyD2H) { - dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } else if (op_name == interpreter::kMemcpyH2D) { - dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority) - .get() - .get(); - interpreter::SetDeviceCommContext(op, dev_ctx); - return dev_ctx; - } - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - // NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum - // with use_cal_stream==false by returning a device context getting from the - // global NCCLCommContext instance. 
Because when use_calc_stream==false, in - // OP kernel, the NCCL communication will be launched to the stream directly - // getting from the global NCCLCommContext instance rather than the - // DeviceContext passed from executor (see CAllReduceOpCUDAKernel in - // c_allreduce_op.h). Now it is just a temporary solution for ONLY - // c_allreduce_sum which is used in ResNet50 distributed training. - if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream") - .dyn_cast<::ir::BoolAttribute>() - .data() == false) { - int ring_id = - op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data(); - return platform::NCCLCommContext::Instance() - .Get(ring_id, place) - ->dev_context(); - } -#endif - } - - if (origin_dev_ctx != nullptr) { - interpreter::SetDeviceCommContext(op, origin_dev_ctx); - } - return origin_dev_ctx; -} - -OpFuncType AnalyseOpFuncType(::ir::Operation* op, - const platform::Place& place) { - if (platform::is_cpu_place(place)) { - return OpFuncType::kCpuSync; - } - - auto kernel_key = op->attributes() - .at("kernel_key") - .dyn_cast() - .data(); - if (phi::TransToPhiPlace(kernel_key.backend()).GetType() == - phi::AllocationType::CPU) { - return OpFuncType::kCpuSync; - } - - PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place), - true, - phi::errors::Fatal("Unsupported current place %s", place)); - - // Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU - // computing. They execute serially in device thread and block CUDA kernel - // launching in other GPU OPs. To improve performance, set them as kGpuSync - // and so that they would be dispatched to host thread. 
- auto op_attributes = op->attributes(); - auto op_name = - op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); - if (op_name == kCoalesceTensor && - (!platform::is_xpu_place(place) || - op->attribute("persist_output").data() == false) && - op->attribute("set_constant").data() == false && - op->attribute("copy_data").data() == false) { - return OpFuncType::kGpuSync; - } - - // for memcpy explicitly called by user - if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) { - return OpFuncType::kGpuSync; - } - - if (op_name == "shape") { - return OpFuncType::kGpuSync; - } - return OpFuncType::kGpuAsync; -} + const int stream_priority); +OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index a41455af1d2258..c82e3cbc28b47b 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1619,7 +1619,7 @@ void NewIRInterpreter::BuildInstruction() { if (op_name == "builtin.combine" || op_name == "pd.feed" || op_name == "builtin.set_parameter" || op_name == "builtin.get_parameter" || op_name == "builtin.slice" || - op_name == "pd.data" || op_name == "pd.shaddow_output") { + op_name == "pd.data" || op_name == "pd.shadow_output") { VLOG(6) << "skip process " << op_name; continue; } diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index ac60a546f983ec..87b48c4c81ffb6 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -90,12 +90,21 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); + 
std::cerr << "print" << std::endl; + base_program->Print(std::cout); + kernel_program->Print(std::cout); interpretercores_.emplace_back( std::make_shared(place_, fetch_var_names_, std::move(kernel_program), scope_, execution_config)); + std::stringstream pre_ss; + pre_ss << interpretercores_.back()->Impl(); + + for (size_t i = 0; i < fetch_var_names_.size(); ++i) { + fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i]; + } } else { interpretercores_.emplace_back( std::make_shared(place_, @@ -160,6 +169,7 @@ paddle::framework::FetchList StandaloneExecutor::Run( if (FLAGS_enable_new_ir_in_executor) { framework::FetchList fetch_res; + std::cerr << "before fetch " << std::endl; for (auto& var_name : fetch_var_names_) { auto* var = scope_->FindVar(var_name); fetch_res.push_back(var->Get()); diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc index e53e1058a84281..ccbe7316b530ea 100644 --- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc +++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc @@ -59,6 +59,10 @@ const std::unordered_set UnchangeOutputOps = { "builtin.get_parameter", "pd.shadow_output"}; +const std::unordered_set LegacyOpList = { + "pd.fused_softmax_mask_upper_triangle", + "pd.fused_softmax_mask_upper_triangle_grad"}; + bool NeedFallBackCpu(const ir::Operation* op, const std::string& kernel_fn_name, const phi::KernelKey& kernel_key) { @@ -401,7 +405,8 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, kernel_fn_str, kernel_key); auto args_def = phi_kernel.args_def(); auto output_defs = args_def.output_defs(); - if (!UnchangeOutputOps.count(op_item->name())) { + if (!UnchangeOutputOps.count(op_item->name()) && + !LegacyOpList.count(op_item->name())) { PADDLE_ENFORCE_EQ( op_item->num_results(), output_defs.size(), @@ -413,7 +418,7 @@ std::unique_ptr PdOpLowerToKernelPass(ir::Program* prog, for (size_t i = 0; i < op_item->num_results(); ++i) { phi::Place out_place; if 
((!UnchangeOutputOps.count(op_item->name())) && - phi_kernel.IsValid()) { + (!LegacyOpList.count(op_item->name())) && phi_kernel.IsValid()) { out_place = phi::TransToPhiPlace(output_defs[i].backend); } else { out_place = phi::TransToPhiPlace(kernel_key.backend()); From 79a57bfea913aa9678416f2ecdd95dd0594ca05c Mon Sep 17 00:00:00 2001 From: phlrain Date: Sun, 6 Aug 2023 03:28:09 +0000 Subject: [PATCH 15/22] fix --- .../new_executor/standalone_executor.cc | 13 +- .../ir/phi_kernel_adaptor/phi_kernel_util.cc | 2 +- .../ir/transforms/constant_folding_pass.cc | 1 + .../fused_softmax_mask_upper_triangle_op.cu | 3 - test/legacy_test/eager_op_test.py | 1 - test/legacy_test/test_channel_shuffle.py | 471 +++++++++--------- 6 files changed, 237 insertions(+), 254 deletions(-) diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc index 87b48c4c81ffb6..0e1d2de6bed29c 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor.cc @@ -90,21 +90,12 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place, auto kernel_program = paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place); - std::cerr << "print" << std::endl; - base_program->Print(std::cout); - kernel_program->Print(std::cout); interpretercores_.emplace_back( std::make_shared(place_, fetch_var_names_, std::move(kernel_program), scope_, execution_config)); - std::stringstream pre_ss; - pre_ss << interpretercores_.back()->Impl(); - - for (size_t i = 0; i < fetch_var_names_.size(); ++i) { - fetch_var_names_[i] = pre_ss.str() + "_" + fetch_var_names_[i]; - } } else { interpretercores_.emplace_back( std::make_shared(place_, @@ -162,14 +153,12 @@ paddle::framework::FetchList StandaloneExecutor::Run( interpretercores_[job_idx]->ShareBuildResultsFrom( interpretercores_[type_to_first_id[job_type]]); } - interpretercores_[job_idx]->BetaRun(feed_names, 
/*need_fetch = */ false); + interpretercores_[job_idx]->Run(feed_names, /*need_fetch = */ false); } // return Fetch Tensors if (FLAGS_enable_new_ir_in_executor) { framework::FetchList fetch_res; - - std::cerr << "before fetch " << std::endl; for (auto& var_name : fetch_var_names_) { auto* var = scope_->FindVar(var_name); fetch_res.push_back(var->Get()); diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc index 3728cd48ea1f10..02eafff2b83331 100644 --- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc +++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc @@ -230,7 +230,7 @@ void HandleForSpecialOp( auto fetch_src_name = op->attributes().at("name").dyn_cast().AsString(); - auto fetch_var_name = var_name_prefix + "_" + fetch_src_name + "@fetch"; + auto fetch_var_name = fetch_src_name + "@fetch"; auto* var = const_cast(inner_scope->root()) ->Var(fetch_var_name); var->GetMutable(); diff --git a/paddle/fluid/ir/transforms/constant_folding_pass.cc b/paddle/fluid/ir/transforms/constant_folding_pass.cc index 0465a189d6f2e2..cebcf5d02f7701 100644 --- a/paddle/fluid/ir/transforms/constant_folding_pass.cc +++ b/paddle/fluid/ir/transforms/constant_folding_pass.cc @@ -100,6 +100,7 @@ class ConstantFoldingPattern : public ir::RewritePattern { paddle::dialect::PdOpLowerToKernelPass(temp_program.get()), &scope_, exe_config); + paddle::framework::FetchList fetch_list = core.Run({}); // TODO(liuyuanle): Support multiple output. 
diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index c098a11537a07a..779ee234071af0 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -475,8 +475,6 @@ template class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - std::cerr << "comute grad " << std::endl; - auto* grad_x = context.Output(framework::GradVarName("X")); auto* grad_y = @@ -487,7 +485,6 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { auto* grad_y_data = grad_y->data(); auto* softmax_rst_data = softmax_rst->data(); - std::cerr << "grad x" << grad_x->dims() << std::endl; auto y_dim = grad_y->dims(); auto batches = y_dim[0]; auto attn_heads = y_dim[1]; diff --git a/test/legacy_test/eager_op_test.py b/test/legacy_test/eager_op_test.py index 485416a3f04af3..a68f81b2f25a41 100644 --- a/test/legacy_test/eager_op_test.py +++ b/test/legacy_test/eager_op_test.py @@ -3038,7 +3038,6 @@ def _get_gradient( compiled_prog = fluid.CompiledProgram(prog, build_strategy) prog = compiled_prog executor = fluid.Executor(place) - print(prog) res = list( map( np.array, diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py index f8b6ef1df9514b..90843648120324 100644 --- a/test/legacy_test/test_channel_shuffle.py +++ b/test/legacy_test/test_channel_shuffle.py @@ -15,12 +15,9 @@ import unittest import numpy as np -from eager_op_test import OpTest, convert_float_to_uint16 +from eager_op_test import OpTest import paddle -import paddle.nn.functional as F -from paddle import fluid -from paddle.fluid import core def channel_shuffle_np(x, groups, data_format="NCHW"): @@ -70,253 +67,253 @@ def init_dtype(self): def init_data_format(self): self.format = "NCHW" - def 
test_check_output(self): - self.check_output() + # def test_check_output(self): + # self.check_output() def test_check_grad(self): self.check_grad(['X'], 'Out') -class TestChannelLast(TestChannelShuffleOp): - def init_data_format(self): - self.format = "NHWC" - - -class TestChannelShuffleAPI(unittest.TestCase): - def setUp(self): - self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") - self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64") - self.out_1_np = channel_shuffle_np(self.x_1_np, 3) - self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") - - def test_static_graph_functional(self): - for use_cuda in ( - [False, True] if core.is_compiled_with_cuda() else [False] - ): - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - - paddle.enable_static() - x_1 = paddle.static.data( - name="x", shape=[2, 9, 4, 4], dtype="float64" - ) - x_2 = paddle.static.data( - name="x2", shape=[2, 4, 4, 9], dtype="float64" - ) - out_1 = F.channel_shuffle(x_1, 3) - out_2 = F.channel_shuffle(x_2, 3, "NHWC") - - exe = paddle.static.Executor(place=place) - res_1 = exe.run( - fluid.default_main_program(), - feed={"x": self.x_1_np}, - fetch_list=out_1, - use_prune=True, - ) - - res_2 = exe.run( - fluid.default_main_program(), - feed={"x2": self.x_2_np}, - fetch_list=out_2, - use_prune=True, - ) - - np.testing.assert_allclose(res_1[0], self.out_1_np) - np.testing.assert_allclose(res_2[0], self.out_2_np) - - # same test between layer and functional in this op. 
- def test_static_graph_layer(self): - for use_cuda in ( - [False, True] if core.is_compiled_with_cuda() else [False] - ): - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - - paddle.enable_static() - x_1 = paddle.static.data( - name="x", shape=[2, 9, 4, 4], dtype="float64" - ) - x_2 = paddle.static.data( - name="x2", shape=[2, 4, 4, 9], dtype="float64" - ) - # init instance - ps_1 = paddle.nn.ChannelShuffle(3) - ps_2 = paddle.nn.ChannelShuffle(3, "NHWC") - out_1 = ps_1(x_1) - out_2 = ps_2(x_2) - out_1_np = channel_shuffle_np(self.x_1_np, 3) - out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") - - exe = paddle.static.Executor(place=place) - res_1 = exe.run( - fluid.default_main_program(), - feed={"x": self.x_1_np}, - fetch_list=out_1, - use_prune=True, - ) - - res_2 = exe.run( - fluid.default_main_program(), - feed={"x2": self.x_2_np}, - fetch_list=out_2, - use_prune=True, - ) - - np.testing.assert_allclose(res_1[0], out_1_np) - np.testing.assert_allclose(res_2[0], out_2_np) - - def run_dygraph(self, groups, data_format): - n, c, h, w = 2, 9, 4, 4 - - if data_format == "NCHW": - shape = [n, c, h, w] - if data_format == "NHWC": - shape = [n, h, w, c] - - x = np.random.random(shape).astype("float64") - - npresult = channel_shuffle_np(x, groups, data_format) - - for use_cuda in ( - [False, True] if core.is_compiled_with_cuda() else [False] - ): - place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - - paddle.disable_static(place=place) - - channel_shuffle = paddle.nn.ChannelShuffle( - groups, data_format=data_format - ) - result = channel_shuffle(paddle.to_tensor(x)) - - np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05) - - result_functional = F.channel_shuffle( - paddle.to_tensor(x), 3, data_format - ) - np.testing.assert_allclose( - result_functional.numpy(), npresult, rtol=1e-05 - ) - - channel_shuffle_str = f'groups={groups}' - if data_format != 'NCHW': - channel_shuffle_str += f', data_format={data_format}' - 
self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str) - - def test_dygraph1(self): - self.run_dygraph(3, "NCHW") - - def test_dygraph2(self): - self.run_dygraph(3, "NHWC") - - -class TestChannelShuffleError(unittest.TestCase): - def test_error_functional(self): - def error_input(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([9, 4, 4]).astype("float64") - channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3) - - self.assertRaises(ValueError, error_input) - - def error_groups_1(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33) - - self.assertRaises(TypeError, error_groups_1) - - def error_groups_2(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1) - - self.assertRaises(ValueError, error_groups_2) - - def error_data_format(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - channel_shuffle = F.channel_shuffle( - paddle.to_tensor(x), 3, "WOW" - ) +# class TestChannelLast(TestChannelShuffleOp): +# def init_data_format(self): +# self.format = "NHWC" + + +# class TestChannelShuffleAPI(unittest.TestCase): +# def setUp(self): +# self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") +# self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64") +# self.out_1_np = channel_shuffle_np(self.x_1_np, 3) +# self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") + +# def test_static_graph_functional(self): +# for use_cuda in ( +# [False, True] if core.is_compiled_with_cuda() else [False] +# ): +# place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + +# paddle.enable_static() +# x_1 = paddle.static.data( +# name="x", shape=[2, 9, 4, 4], dtype="float64" +# ) +# x_2 = paddle.static.data( +# name="x2", shape=[2, 4, 4, 9], dtype="float64" +# ) +# out_1 = 
F.channel_shuffle(x_1, 3) +# out_2 = F.channel_shuffle(x_2, 3, "NHWC") + +# exe = paddle.static.Executor(place=place) +# res_1 = exe.run( +# fluid.default_main_program(), +# feed={"x": self.x_1_np}, +# fetch_list=out_1, +# use_prune=True, +# ) + +# res_2 = exe.run( +# fluid.default_main_program(), +# feed={"x2": self.x_2_np}, +# fetch_list=out_2, +# use_prune=True, +# ) + +# np.testing.assert_allclose(res_1[0], self.out_1_np) +# np.testing.assert_allclose(res_2[0], self.out_2_np) + +# # same test between layer and functional in this op. +# def test_static_graph_layer(self): +# for use_cuda in ( +# [False, True] if core.is_compiled_with_cuda() else [False] +# ): +# place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + +# paddle.enable_static() +# x_1 = paddle.static.data( +# name="x", shape=[2, 9, 4, 4], dtype="float64" +# ) +# x_2 = paddle.static.data( +# name="x2", shape=[2, 4, 4, 9], dtype="float64" +# ) +# # init instance +# ps_1 = paddle.nn.ChannelShuffle(3) +# ps_2 = paddle.nn.ChannelShuffle(3, "NHWC") +# out_1 = ps_1(x_1) +# out_2 = ps_2(x_2) +# out_1_np = channel_shuffle_np(self.x_1_np, 3) +# out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") + +# exe = paddle.static.Executor(place=place) +# res_1 = exe.run( +# fluid.default_main_program(), +# feed={"x": self.x_1_np}, +# fetch_list=out_1, +# use_prune=True, +# ) + +# res_2 = exe.run( +# fluid.default_main_program(), +# feed={"x2": self.x_2_np}, +# fetch_list=out_2, +# use_prune=True, +# ) + +# np.testing.assert_allclose(res_1[0], out_1_np) +# np.testing.assert_allclose(res_2[0], out_2_np) + +# def run_dygraph(self, groups, data_format): +# n, c, h, w = 2, 9, 4, 4 + +# if data_format == "NCHW": +# shape = [n, c, h, w] +# if data_format == "NHWC": +# shape = [n, h, w, c] + +# x = np.random.random(shape).astype("float64") + +# npresult = channel_shuffle_np(x, groups, data_format) + +# for use_cuda in ( +# [False, True] if core.is_compiled_with_cuda() else [False] +# ): +# place = 
paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + +# paddle.disable_static(place=place) + +# channel_shuffle = paddle.nn.ChannelShuffle( +# groups, data_format=data_format +# ) +# result = channel_shuffle(paddle.to_tensor(x)) + +# np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05) + +# result_functional = F.channel_shuffle( +# paddle.to_tensor(x), 3, data_format +# ) +# np.testing.assert_allclose( +# result_functional.numpy(), npresult, rtol=1e-05 +# ) + +# channel_shuffle_str = f'groups={groups}' +# if data_format != 'NCHW': +# channel_shuffle_str += f', data_format={data_format}' +# self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str) + +# def test_dygraph1(self): +# self.run_dygraph(3, "NCHW") + +# def test_dygraph2(self): +# self.run_dygraph(3, "NHWC") + + +# class TestChannelShuffleError(unittest.TestCase): +# def test_error_functional(self): +# def error_input(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([9, 4, 4]).astype("float64") +# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3) + +# self.assertRaises(ValueError, error_input) + +# def error_groups_1(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33) + +# self.assertRaises(TypeError, error_groups_1) + +# def error_groups_2(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1) + +# self.assertRaises(ValueError, error_groups_2) + +# def error_data_format(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# channel_shuffle = F.channel_shuffle( +# paddle.to_tensor(x), 3, "WOW" +# ) + +# self.assertRaises(ValueError, error_data_format) + +# def test_error_layer(self): +# def error_input_layer(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([9, 4, 
4]).astype("float64") +# cs = paddle.nn.ChannelShuffle(3) +# cs(paddle.to_tensor(x)) + +# self.assertRaises(ValueError, error_input_layer) + +# def error_groups_layer_1(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# cs = paddle.nn.ChannelShuffle(3.33) + +# self.assertRaises(TypeError, error_groups_layer_1) - self.assertRaises(ValueError, error_data_format) +# def error_groups_layer_2(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# cs = paddle.nn.ChannelShuffle(-1) - def test_error_layer(self): - def error_input_layer(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([9, 4, 4]).astype("float64") - cs = paddle.nn.ChannelShuffle(3) - cs(paddle.to_tensor(x)) +# self.assertRaises(ValueError, error_groups_layer_2) - self.assertRaises(ValueError, error_input_layer) +# def error_data_format_layer(): +# with paddle.fluid.dygraph.guard(): +# x = np.random.random([2, 9, 4, 4]).astype("float64") +# cs = paddle.nn.ChannelShuffle(3, "MEOW") - def error_groups_layer_1(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - cs = paddle.nn.ChannelShuffle(3.33) +# self.assertRaises(ValueError, error_data_format_layer) - self.assertRaises(TypeError, error_groups_layer_1) - def error_groups_layer_2(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - cs = paddle.nn.ChannelShuffle(-1) +# class TestChannelShuffleFP16OP(TestChannelShuffleOp): +# def init_dtype(self): +# self.dtype = np.float16 - self.assertRaises(ValueError, error_groups_layer_2) - def error_data_format_layer(): - with paddle.fluid.dygraph.guard(): - x = np.random.random([2, 9, 4, 4]).astype("float64") - cs = paddle.nn.ChannelShuffle(3, "MEOW") +# @unittest.skipIf( +# not core.is_compiled_with_cuda() +# or not core.is_bfloat16_supported(core.CUDAPlace(0)), +# "core is not complied with CUDA and not support the bfloat16", 
+# ) +# class TestChannelShuffleBF16OP(OpTest): +# def setUp(self): +# self.op_type = "channel_shuffle" +# self.init_data_format() +# n, c, h, w = 2, 9, 4, 4 +# self.python_api = paddle.nn.functional.channel_shuffle +# self.dtype = np.uint16 +# self.use_mkldnn = False + +# if self.format == "NCHW": +# shape = [n, c, h, w] +# if self.format == "NHWC": +# shape = [n, h, w, c] + +# groups = 3 + +# x = np.random.random(shape).astype('float32') +# out = channel_shuffle_np(x, groups, self.format) +# self.inputs = {'X': convert_float_to_uint16(x)} +# self.attrs = {'groups': groups, "data_format": self.format} +# self.outputs = {'Out': convert_float_to_uint16(out)} - self.assertRaises(ValueError, error_data_format_layer) - - -class TestChannelShuffleFP16OP(TestChannelShuffleOp): - def init_dtype(self): - self.dtype = np.float16 - - -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the bfloat16", -) -class TestChannelShuffleBF16OP(OpTest): - def setUp(self): - self.op_type = "channel_shuffle" - self.init_data_format() - n, c, h, w = 2, 9, 4, 4 - self.python_api = paddle.nn.functional.channel_shuffle - self.dtype = np.uint16 - self.use_mkldnn = False - - if self.format == "NCHW": - shape = [n, c, h, w] - if self.format == "NHWC": - shape = [n, h, w, c] - - groups = 3 - - x = np.random.random(shape).astype('float32') - out = channel_shuffle_np(x, groups, self.format) - self.inputs = {'X': convert_float_to_uint16(x)} - self.attrs = {'groups': groups, "data_format": self.format} - self.outputs = {'Out': convert_float_to_uint16(out)} - - def init_data_format(self): - self.format = "NCHW" - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place) - - def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - ) +# def init_data_format(self): +# self.format = "NCHW" + +# def 
test_check_output(self): +# place = core.CUDAPlace(0) +# self.check_output_with_place(place) + +# def test_check_grad(self): +# place = core.CUDAPlace(0) +# self.check_grad_with_place( +# place, +# ['X'], +# 'Out', +# ) if __name__ == '__main__': From 33947b0bbaa55e13e0d54eb4fcbed88a489ddc53 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sun, 6 Aug 2023 03:37:44 +0000 Subject: [PATCH 16/22] revert channel shuffl test --- test/legacy_test/test_channel_shuffle.py | 467 ++++++++++++----------- 1 file changed, 235 insertions(+), 232 deletions(-) diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py index 90843648120324..8da783e6773a26 100644 --- a/test/legacy_test/test_channel_shuffle.py +++ b/test/legacy_test/test_channel_shuffle.py @@ -15,9 +15,12 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle +import paddle.nn.functional as F +from paddle import fluid +from paddle.fluid import core def channel_shuffle_np(x, groups, data_format="NCHW"): @@ -74,246 +77,246 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') -# class TestChannelLast(TestChannelShuffleOp): -# def init_data_format(self): -# self.format = "NHWC" - - -# class TestChannelShuffleAPI(unittest.TestCase): -# def setUp(self): -# self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") -# self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64") -# self.out_1_np = channel_shuffle_np(self.x_1_np, 3) -# self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") - -# def test_static_graph_functional(self): -# for use_cuda in ( -# [False, True] if core.is_compiled_with_cuda() else [False] -# ): -# place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - -# paddle.enable_static() -# x_1 = paddle.static.data( -# name="x", shape=[2, 9, 4, 4], dtype="float64" -# ) -# x_2 = paddle.static.data( -# name="x2", shape=[2, 4, 4, 9], dtype="float64" -# ) -# out_1 = 
F.channel_shuffle(x_1, 3) -# out_2 = F.channel_shuffle(x_2, 3, "NHWC") - -# exe = paddle.static.Executor(place=place) -# res_1 = exe.run( -# fluid.default_main_program(), -# feed={"x": self.x_1_np}, -# fetch_list=out_1, -# use_prune=True, -# ) - -# res_2 = exe.run( -# fluid.default_main_program(), -# feed={"x2": self.x_2_np}, -# fetch_list=out_2, -# use_prune=True, -# ) - -# np.testing.assert_allclose(res_1[0], self.out_1_np) -# np.testing.assert_allclose(res_2[0], self.out_2_np) - -# # same test between layer and functional in this op. -# def test_static_graph_layer(self): -# for use_cuda in ( -# [False, True] if core.is_compiled_with_cuda() else [False] -# ): -# place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - -# paddle.enable_static() -# x_1 = paddle.static.data( -# name="x", shape=[2, 9, 4, 4], dtype="float64" -# ) -# x_2 = paddle.static.data( -# name="x2", shape=[2, 4, 4, 9], dtype="float64" -# ) -# # init instance -# ps_1 = paddle.nn.ChannelShuffle(3) -# ps_2 = paddle.nn.ChannelShuffle(3, "NHWC") -# out_1 = ps_1(x_1) -# out_2 = ps_2(x_2) -# out_1_np = channel_shuffle_np(self.x_1_np, 3) -# out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") - -# exe = paddle.static.Executor(place=place) -# res_1 = exe.run( -# fluid.default_main_program(), -# feed={"x": self.x_1_np}, -# fetch_list=out_1, -# use_prune=True, -# ) - -# res_2 = exe.run( -# fluid.default_main_program(), -# feed={"x2": self.x_2_np}, -# fetch_list=out_2, -# use_prune=True, -# ) - -# np.testing.assert_allclose(res_1[0], out_1_np) -# np.testing.assert_allclose(res_2[0], out_2_np) - -# def run_dygraph(self, groups, data_format): -# n, c, h, w = 2, 9, 4, 4 - -# if data_format == "NCHW": -# shape = [n, c, h, w] -# if data_format == "NHWC": -# shape = [n, h, w, c] - -# x = np.random.random(shape).astype("float64") - -# npresult = channel_shuffle_np(x, groups, data_format) - -# for use_cuda in ( -# [False, True] if core.is_compiled_with_cuda() else [False] -# ): -# place = 
paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() - -# paddle.disable_static(place=place) - -# channel_shuffle = paddle.nn.ChannelShuffle( -# groups, data_format=data_format -# ) -# result = channel_shuffle(paddle.to_tensor(x)) - -# np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05) - -# result_functional = F.channel_shuffle( -# paddle.to_tensor(x), 3, data_format -# ) -# np.testing.assert_allclose( -# result_functional.numpy(), npresult, rtol=1e-05 -# ) - -# channel_shuffle_str = f'groups={groups}' -# if data_format != 'NCHW': -# channel_shuffle_str += f', data_format={data_format}' -# self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str) - -# def test_dygraph1(self): -# self.run_dygraph(3, "NCHW") - -# def test_dygraph2(self): -# self.run_dygraph(3, "NHWC") - - -# class TestChannelShuffleError(unittest.TestCase): -# def test_error_functional(self): -# def error_input(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([9, 4, 4]).astype("float64") -# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3) - -# self.assertRaises(ValueError, error_input) - -# def error_groups_1(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33) - -# self.assertRaises(TypeError, error_groups_1) - -# def error_groups_2(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1) - -# self.assertRaises(ValueError, error_groups_2) - -# def error_data_format(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# channel_shuffle = F.channel_shuffle( -# paddle.to_tensor(x), 3, "WOW" -# ) - -# self.assertRaises(ValueError, error_data_format) - -# def test_error_layer(self): -# def error_input_layer(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([9, 4, 
4]).astype("float64") -# cs = paddle.nn.ChannelShuffle(3) -# cs(paddle.to_tensor(x)) - -# self.assertRaises(ValueError, error_input_layer) - -# def error_groups_layer_1(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# cs = paddle.nn.ChannelShuffle(3.33) - -# self.assertRaises(TypeError, error_groups_layer_1) +class TestChannelLast(TestChannelShuffleOp): + def init_data_format(self): + self.format = "NHWC" + + +class TestChannelShuffleAPI(unittest.TestCase): + def setUp(self): + self.x_1_np = np.random.random([2, 9, 4, 4]).astype("float64") + self.x_2_np = np.random.random([2, 4, 4, 9]).astype("float64") + self.out_1_np = channel_shuffle_np(self.x_1_np, 3) + self.out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") + + def test_static_graph_functional(self): + for use_cuda in ( + [False, True] if core.is_compiled_with_cuda() else [False] + ): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.static.data( + name="x", shape=[2, 9, 4, 4], dtype="float64" + ) + x_2 = paddle.static.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64" + ) + out_1 = F.channel_shuffle(x_1, 3) + out_2 = F.channel_shuffle(x_2, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run( + fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True, + ) + + res_2 = exe.run( + fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True, + ) + + np.testing.assert_allclose(res_1[0], self.out_1_np) + np.testing.assert_allclose(res_2[0], self.out_2_np) + + # same test between layer and functional in this op. 
+ def test_static_graph_layer(self): + for use_cuda in ( + [False, True] if core.is_compiled_with_cuda() else [False] + ): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.enable_static() + x_1 = paddle.static.data( + name="x", shape=[2, 9, 4, 4], dtype="float64" + ) + x_2 = paddle.static.data( + name="x2", shape=[2, 4, 4, 9], dtype="float64" + ) + # init instance + ps_1 = paddle.nn.ChannelShuffle(3) + ps_2 = paddle.nn.ChannelShuffle(3, "NHWC") + out_1 = ps_1(x_1) + out_2 = ps_2(x_2) + out_1_np = channel_shuffle_np(self.x_1_np, 3) + out_2_np = channel_shuffle_np(self.x_2_np, 3, "NHWC") + + exe = paddle.static.Executor(place=place) + res_1 = exe.run( + fluid.default_main_program(), + feed={"x": self.x_1_np}, + fetch_list=out_1, + use_prune=True, + ) + + res_2 = exe.run( + fluid.default_main_program(), + feed={"x2": self.x_2_np}, + fetch_list=out_2, + use_prune=True, + ) + + np.testing.assert_allclose(res_1[0], out_1_np) + np.testing.assert_allclose(res_2[0], out_2_np) + + def run_dygraph(self, groups, data_format): + n, c, h, w = 2, 9, 4, 4 + + if data_format == "NCHW": + shape = [n, c, h, w] + if data_format == "NHWC": + shape = [n, h, w, c] + + x = np.random.random(shape).astype("float64") + + npresult = channel_shuffle_np(x, groups, data_format) + + for use_cuda in ( + [False, True] if core.is_compiled_with_cuda() else [False] + ): + place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() + + paddle.disable_static(place=place) + + channel_shuffle = paddle.nn.ChannelShuffle( + groups, data_format=data_format + ) + result = channel_shuffle(paddle.to_tensor(x)) + + np.testing.assert_allclose(result.numpy(), npresult, rtol=1e-05) + + result_functional = F.channel_shuffle( + paddle.to_tensor(x), 3, data_format + ) + np.testing.assert_allclose( + result_functional.numpy(), npresult, rtol=1e-05 + ) + + channel_shuffle_str = f'groups={groups}' + if data_format != 'NCHW': + channel_shuffle_str += f', data_format={data_format}' + 
self.assertEqual(channel_shuffle.extra_repr(), channel_shuffle_str) + + def test_dygraph1(self): + self.run_dygraph(3, "NCHW") + + def test_dygraph2(self): + self.run_dygraph(3, "NHWC") + + +class TestChannelShuffleError(unittest.TestCase): + def test_error_functional(self): + def error_input(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([9, 4, 4]).astype("float64") + channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3) + + self.assertRaises(ValueError, error_input) + + def error_groups_1(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), 3.33) + + self.assertRaises(TypeError, error_groups_1) + + def error_groups_2(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + channel_shuffle = F.channel_shuffle(paddle.to_tensor(x), -1) + + self.assertRaises(ValueError, error_groups_2) + + def error_data_format(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + channel_shuffle = F.channel_shuffle( + paddle.to_tensor(x), 3, "WOW" + ) -# def error_groups_layer_2(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# cs = paddle.nn.ChannelShuffle(-1) + self.assertRaises(ValueError, error_data_format) -# self.assertRaises(ValueError, error_groups_layer_2) + def test_error_layer(self): + def error_input_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([9, 4, 4]).astype("float64") + cs = paddle.nn.ChannelShuffle(3) + cs(paddle.to_tensor(x)) -# def error_data_format_layer(): -# with paddle.fluid.dygraph.guard(): -# x = np.random.random([2, 9, 4, 4]).astype("float64") -# cs = paddle.nn.ChannelShuffle(3, "MEOW") + self.assertRaises(ValueError, error_input_layer) -# self.assertRaises(ValueError, error_data_format_layer) + def error_groups_layer_1(): + with paddle.fluid.dygraph.guard(): + x = 
np.random.random([2, 9, 4, 4]).astype("float64") + cs = paddle.nn.ChannelShuffle(3.33) + self.assertRaises(TypeError, error_groups_layer_1) -# class TestChannelShuffleFP16OP(TestChannelShuffleOp): -# def init_dtype(self): -# self.dtype = np.float16 + def error_groups_layer_2(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + cs = paddle.nn.ChannelShuffle(-1) + self.assertRaises(ValueError, error_groups_layer_2) -# @unittest.skipIf( -# not core.is_compiled_with_cuda() -# or not core.is_bfloat16_supported(core.CUDAPlace(0)), -# "core is not complied with CUDA and not support the bfloat16", -# ) -# class TestChannelShuffleBF16OP(OpTest): -# def setUp(self): -# self.op_type = "channel_shuffle" -# self.init_data_format() -# n, c, h, w = 2, 9, 4, 4 -# self.python_api = paddle.nn.functional.channel_shuffle -# self.dtype = np.uint16 -# self.use_mkldnn = False - -# if self.format == "NCHW": -# shape = [n, c, h, w] -# if self.format == "NHWC": -# shape = [n, h, w, c] - -# groups = 3 - -# x = np.random.random(shape).astype('float32') -# out = channel_shuffle_np(x, groups, self.format) -# self.inputs = {'X': convert_float_to_uint16(x)} -# self.attrs = {'groups': groups, "data_format": self.format} -# self.outputs = {'Out': convert_float_to_uint16(out)} + def error_data_format_layer(): + with paddle.fluid.dygraph.guard(): + x = np.random.random([2, 9, 4, 4]).astype("float64") + cs = paddle.nn.ChannelShuffle(3, "MEOW") -# def init_data_format(self): -# self.format = "NCHW" - -# def test_check_output(self): -# place = core.CUDAPlace(0) -# self.check_output_with_place(place) - -# def test_check_grad(self): -# place = core.CUDAPlace(0) -# self.check_grad_with_place( -# place, -# ['X'], -# 'Out', -# ) + self.assertRaises(ValueError, error_data_format_layer) + + +class TestChannelShuffleFP16OP(TestChannelShuffleOp): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not 
core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestChannelShuffleBF16OP(OpTest): + def setUp(self): + self.op_type = "channel_shuffle" + self.init_data_format() + n, c, h, w = 2, 9, 4, 4 + self.python_api = paddle.nn.functional.channel_shuffle + self.dtype = np.uint16 + self.use_mkldnn = False + + if self.format == "NCHW": + shape = [n, c, h, w] + if self.format == "NHWC": + shape = [n, h, w, c] + + groups = 3 + + x = np.random.random(shape).astype('float32') + out = channel_shuffle_np(x, groups, self.format) + self.inputs = {'X': convert_float_to_uint16(x)} + self.attrs = {'groups': groups, "data_format": self.format} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def init_data_format(self): + self.format = "NCHW" + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + ) if __name__ == '__main__': From 14c4e9bf3d50e94926955d3ac4f93eb81751bb13 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sun, 6 Aug 2023 09:06:33 +0000 Subject: [PATCH 17/22] polish code --- .../instruction/instruction_base.cc | 34 +++++++++++-------- .../instruction/instruction_base.h | 3 -- .../instruction/phi_kernel_instruction.h | 2 ++ 3 files changed, 21 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc index 5fd12551ff176c..fb9dc40ed57c9d 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.cc +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.cc @@ -106,6 +106,9 @@ void InstructionBase::InitInputsOutputsIds( const std::map& var_name_2_id, const std::unordered_map& variable_2_var_name) { + auto op_attributes = op->attributes(); + auto op_name = + 
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString(); std::unordered_map> inputs; for (size_t i = 0; i < op->num_operands(); i++) { ir::Value value = op->operand_source(i); @@ -116,7 +119,7 @@ void InstructionBase::InitInputsOutputsIds( phi::errors::PreconditionNotMet( "input should in name map, [%d] 'th input of [%s] op", i, - phi_op_name_)); + op_name)); std::vector inputs_id = GetValueIds(value, inner_scope, value_2_var_name, @@ -130,22 +133,23 @@ void InstructionBase::InitInputsOutputsIds( std::unordered_map> outputs; for (size_t i = 0; i < op->num_results(); i++) { ir::Value value = op->result(i); - if (value) { - PADDLE_ENFORCE_NE( - value_2_var_name.find(value), - value_2_var_name.end(), - phi::errors::PreconditionNotMet( - "input should in name map, [%d] 'th input of [%s] op", - i, - phi_op_name_)); - std::vector outputs_id = GetValueIds(value, - inner_scope, - value_2_var_name, - var_name_2_id, - variable_2_var_name); - outputs.emplace(value, outputs_id); + if ((!value) || (!(value.type()))) { + continue; } + + PADDLE_ENFORCE_NE( + value_2_var_name.find(value), + value_2_var_name.end(), + phi::errors::PreconditionNotMet( + "input should in name map, [%d] 'th input of [%s] op", i, op_name)); + std::vector outputs_id = GetValueIds(value, + inner_scope, + value_2_var_name, + var_name_2_id, + variable_2_var_name); + outputs.emplace(value, outputs_id); } + SetOutputs(outputs); VLOG(8) << "finish process outputs_index"; } diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index 5ce2358a7df799..c8ca7d8c6158ce 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -183,9 +183,6 @@ class InstructionBase { std::unordered_map<::ir::Value, std::vector> output_index_; std::unordered_set<::ir::Value> no_need_buffer_values_; - - protected: - std::string 
phi_op_name_; }; } // namespace framework diff --git a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h index 1d7e887e3bd93d..c637cce8651fbf 100644 --- a/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h @@ -63,6 +63,8 @@ class PhiKernelInstruction : public InstructionBase { phi::KernelContext kernel_context_; phi::Kernel* phi_kernel_{nullptr}; // not owned + + std::string phi_op_name_; }; } // namespace framework From 55f630664701c2377f63a6065c069dee38e65602 Mon Sep 17 00:00:00 2001 From: phlrain Date: Sun, 6 Aug 2023 14:51:32 +0000 Subject: [PATCH 18/22] try to fix windows compile error --- .../new_executor/instruction/legacy_kernel_instruction.cc | 7 +++---- .../new_executor/instruction/legacy_kernel_instruction.h | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index b3d013443055d7..92306779c612e1 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -138,9 +138,8 @@ LegacyKernelInstruction::LegacyKernelInstruction( local_scope, yaml_info_parser, runtime_context_.get()); - kernel_context_ = std::make_shared( - paddle::framework::ExecutionContext( - *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get()))); + kernel_context_ = new paddle::framework::ExecutionContext( + *operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get())); VLOG(6) << "finish process kernel context"; SetDeviceContext( @@ -169,7 +168,7 @@ LegacyKernelInstruction::LegacyKernelInstruction( void LegacyKernelInstruction::Run() { infer_meta_interface_->infer_meta_(&(infer_meta_context_)); 
VLOG(6) << "Run op " << legacy_op_name_ << " infer meta."; - (*(phi_kernel_))((kernel_context_.get())); + (*(phi_kernel_))((kernel_context_)); VLOG(6) << "Run op " << legacy_op_name_ << " kernel."; } diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index a8a150fbb6c776..19ff76e36075fb 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -62,7 +62,7 @@ class LegacyKernelInstruction : public InstructionBase { std::shared_ptr runtime_context_; std::shared_ptr operator_base_; - std::shared_ptr kernel_context_; + paddle::framework::ExecutionContext* kernel_context_; phi::Kernel* phi_kernel_{nullptr}; // not owned }; From a17ab1443e1d9a949b027b1ce7466e43f3d83567 Mon Sep 17 00:00:00 2001 From: phlrain Date: Mon, 7 Aug 2023 01:47:23 +0000 Subject: [PATCH 19/22] polish code --- .../framework/new_executor/instruction/instruction_util.h | 1 + .../new_executor/instruction/legacy_kernel_instruction.cc | 6 ++++++ .../new_executor/instruction/legacy_kernel_instruction.h | 3 ++- paddle/fluid/framework/new_executor/new_ir_interpreter.cc | 1 - 4 files changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_util.h b/paddle/fluid/framework/new_executor/instruction/instruction_util.h index 3d0aa3df9de963..a41ce07957e4ae 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_util.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_util.h @@ -44,5 +44,6 @@ platform::DeviceContext* ParseDeviceContext( const int stream_priority); OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc 
b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc index 92306779c612e1..eadf0c1f806cf4 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.cc @@ -165,6 +165,12 @@ LegacyKernelInstruction::LegacyKernelInstruction( VLOG(6) << "finish process no need buffer"; } +LegacyKernelInstruction::~LegacyKernelInstruction() { + if (kernel_context_ != nullptr) { + delete kernel_context_; + } +} + void LegacyKernelInstruction::Run() { infer_meta_interface_->infer_meta_(&(infer_meta_context_)); VLOG(6) << "Run op " << legacy_op_name_ << " infer meta."; diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index 19ff76e36075fb..bade9481c3cb1e 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -38,6 +38,7 @@ class LegacyKernelInstruction : public InstructionBase { const std::unordered_map& variable_2_var_name); + ~LegacyKernelInstruction(); phi::Kernel* PhiKernel() const { return phi_kernel_; } const phi::InferMetaContext& InferMetaContext() const { @@ -60,9 +61,9 @@ class LegacyKernelInstruction : public InstructionBase { phi::InferMetaContext infer_meta_context_; + paddle::framework::ExecutionContext* kernel_context_{nullptr}; std::shared_ptr runtime_context_; std::shared_ptr operator_base_; - paddle::framework::ExecutionContext* kernel_context_; phi::Kernel* phi_kernel_{nullptr}; // not owned }; diff --git a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc index c82e3cbc28b47b..9def00f22e537b 100644 --- a/paddle/fluid/framework/new_executor/new_ir_interpreter.cc +++ b/paddle/fluid/framework/new_executor/new_ir_interpreter.cc @@ -1626,7 
+1626,6 @@ void NewIRInterpreter::BuildInstruction() { if (op_name == "pd.fused_softmax_mask_upper_triangle" || op_name == "pd.fused_softmax_mask_upper_triangle_grad") { - std::cerr << "emplace lagcy kernel " << op_name << std::endl; vec_instruction_base_.emplace_back( std::make_unique(op_idx++, place_, From 1f713ed8b0aa89c13a735a684ccbe202e85bc818 Mon Sep 17 00:00:00 2001 From: phlrain Date: Mon, 7 Aug 2023 07:48:19 +0000 Subject: [PATCH 20/22] update --- .../fluid/framework/new_executor/instruction/instruction_base.h | 1 - .../new_executor/instruction/legacy_kernel_instruction.h | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/new_executor/instruction/instruction_base.h b/paddle/fluid/framework/new_executor/instruction/instruction_base.h index c8ca7d8c6158ce..f078da97107e7e 100644 --- a/paddle/fluid/framework/new_executor/instruction/instruction_base.h +++ b/paddle/fluid/framework/new_executor/instruction/instruction_base.h @@ -21,7 +21,6 @@ #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/platform/event.h" -#include "paddle/ir/core/operation.h" #include "paddle/ir/core/value.h" namespace ir { diff --git a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h index bade9481c3cb1e..27c1cb133bec01 100644 --- a/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h +++ b/paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h @@ -18,12 +18,12 @@ namespace ir { class Operation; +class Value; } // namespace ir namespace paddle { namespace framework { class Scope; -class Value; class LegacyKernelInstruction : public InstructionBase { public: From ca68a2b5412db735a508aa1f515b679d9c7a9f64 Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 8 Aug 2023 01:51:25 +0000 Subject: [PATCH 21/22] update --- third_party/flashattn | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/third_party/flashattn b/third_party/flashattn index 18106c1ba0ccee..b5bdb79d5e1f2f 160000 --- a/third_party/flashattn +++ b/third_party/flashattn @@ -1 +1 @@ -Subproject commit 18106c1ba0ccee81b97ca947397c08a141815a47 +Subproject commit b5bdb79d5e1f2f88b1ef62e86899a14f82fa079a From 8ec5558bead4c7738008209edee5346f05081d80 Mon Sep 17 00:00:00 2001 From: phlrain Date: Tue, 8 Aug 2023 01:54:49 +0000 Subject: [PATCH 22/22] revert op test --- test/legacy_test/test_channel_shuffle.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/legacy_test/test_channel_shuffle.py b/test/legacy_test/test_channel_shuffle.py index 8da783e6773a26..f8b6ef1df9514b 100644 --- a/test/legacy_test/test_channel_shuffle.py +++ b/test/legacy_test/test_channel_shuffle.py @@ -70,8 +70,8 @@ def init_dtype(self): def init_data_format(self): self.format = "NCHW" - # def test_check_output(self): - # self.check_output() + def test_check_output(self): + self.check_output() def test_check_grad(self): self.check_grad(['X'], 'Out')