diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 8f6e6f4028c1d..2a3304fffe63c 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -98,6 +98,7 @@ inline void run_program_ad_func( std::vector& dout, // NOLINT const paddle::framework::AttributeMap& attrs) { // Prepare Autograd Meta + VLOG(2) << "start run run_program ad function."; auto deref_out = details::DereferenceTensors(out); std::vector p_autograd_x = egr::EagerUtils::nullable_autograd_meta(x); @@ -174,3 +175,107 @@ inline void run_program_ad_func( egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); } } + +inline void newir_run_program_ad_func( + const std::vector& x, + const std::vector& params, + std::vector& out, // NOLINT + std::vector& step_scope, // NOLINT + std::vector& dout, // NOLINT + const paddle::framework::AttributeMap& attrs) { + // Prepare Autograd Meta + VLOG(2) << "start run newir run_program ad function."; + auto deref_out = details::DereferenceTensors(out); + std::vector p_autograd_x = + egr::EagerUtils::nullable_autograd_meta(x); + std::vector p_autograd_params = + egr::EagerUtils::nullable_autograd_meta(params); + std::vector p_autograd_outs = + egr::EagerUtils::nullable_autograd_meta(deref_out); + + bool trace_backward = egr::Controller::Instance().HasGrad(); + bool require_any_grad = egr::EagerUtils::ComputeRequireGrad( + trace_backward, &p_autograd_x, &p_autograd_params); + + // Create Middle Output for GradNode. + auto middle_size = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm")).size(); + auto output_size = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")).size(); + auto middles = std::vector(); + std::shared_ptr grad_node; + VLOG(2) << "start run run_program with require_any_grad = " + << require_any_grad; + + if (require_any_grad) { + // Create GradOpNode (1 means [out_grad], 2 means [x_grad, paramx_grad]) + grad_node = std::make_shared(1, 2); + grad_node->GetMiddle().resize(middle_size); + grad_node->GetOutputs().resize(output_size); + for (size_t i = 0; i < middle_size; ++i) { + grad_node->GetMiddle()[i] = + paddle::Tensor(std::make_shared()); + middles.push_back(&grad_node->GetMiddle()[i]); + } + for (size_t i = 0; i < output_size; ++i) { + grad_node->GetOutputs()[i] = *out[i]; + } + } + + // Call forward function + // if require_any_grad is False, don't save any middle vars. + NewIRRunProgramAPI( + x, params, out, middles, step_scope, dout, require_any_grad, attrs); + if (require_any_grad) { + // auto x_names = + // PADDLE_GET_CONST(std::vector, attrs.at("x_names")); + + egr::EagerUtils::PassStopGradient(false, &p_autograd_outs); + + // Set Attributes + grad_node->SetAttrMap(attrs); + + // auto* forward_global_block = PADDLE_GET_CONST( + // paddle::framework::BlockDesc*, attrs.at("forward_global_block")); + // auto* backward_global_block = PADDLE_GET_CONST( + // paddle::framework::BlockDesc*, attrs.at("backward_global_block")); + // Clear unused x vars + // auto filter_x = + // filter_unused_input_var_in_backward(x, x_names, backward_global_block); + // Set TensorWrappers + grad_node->SetFwdX(x); + // Clear unused out vars + // clear_unused_out_var_in_backward(out, backward_global_block, + // step_scope[0]); + + grad_node->SetFwdParams(params); + grad_node->SetStepScope(step_scope); // just for set useable. 
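      // NOTE: assumed data flow, inferred from this function together with
      // NewIRRunProgramAPI / NewIRRunProgramGradAPI (not an extra contract):
      //   forward : the tensors behind attrs["fm"] / attrs["fo"] are stored in
      //             grad_node->GetMiddle() / grad_node->GetOutputs();
      //   backward: the same tensors are shared back into the inner scope via
      //             the "bm" / "bo" values, so no intermediate is recomputed.
      // Slot 0 of the grad node corresponds to x and slot 1 to params, which
      // is what the SetGradOutMeta calls below rely on.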
+ + // Set Grad out rank as same as fwd input and set stop gradient to bwd + // NOTE(@xiongkun): Not every tensor in x(list of tensor) is required + // gradient. for example: x[1] is not used for output, the x[1] is ignored. + + // TODO(@xiongkun): rewrite by new ir representation. + std::vector x_require_grad; + for (size_t i = 0; i < x.size(); ++i) { + x_require_grad.push_back(&x[i]); + } + + grad_node->SetGradOutMeta(x_require_grad, /*slot id*/ 0); + grad_node->SetGradOutMeta(params, /*slot id*/ 1); + + // VLOG(2) << "clear_no_grad_edges."; + // clear_no_grad_edges_with_partial_block(params, + // forward_global_block, + // backward_global_block, + // grad_node.get(), + // [>slot id<] 1); + + grad_node->SetGradInMeta(deref_out, 0); + + egr::EagerUtils::SetOutRankWithSlot(&p_autograd_outs, 0); + + // Set History for output set current Grad Node for + egr::EagerUtils::SetHistory(&p_autograd_outs, grad_node); + } +} diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 3b14651e0c775..8f6f8cbbc22fc 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -24,6 +24,9 @@ #include "paddle/fluid/pir/transforms/pd_op_to_kernel_pass.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler/event_tracing.h" +#include "paddle/pir/core/attribute.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/builtin_attribute.h" #include "paddle/pir/core/program.h" #include "paddle/pir/core/value.h" @@ -175,6 +178,33 @@ static void ShareTensorsIntoScopeWithName( } } +static auto GetNameFromValue(const ::pir::Block *block, + const std::vector<::pir::Value> &values) { + // we use name here, later value is used directly. + std::unordered_map<::pir::Value, std::string> value2name; + for (auto *op : *block) { + std::string name; + if (op->name() == "pd_op.data") { + name = + op->attributes().at("name").dyn_cast().AsString(); + value2name[op->results()[0].Value::impl()] = name; + } else if (op->name() == "builtin.set_parameter") { + name = op->attributes() + .at("parameter_name") + .dyn_cast() + .AsString(); + value2name[op->operand(0).source()] = name; + } + } + std::vector names; + std::transform( + values.begin(), + values.end(), + std::back_inserter(names), + [&value2name](const ::pir::Value &v) { return value2name[v]; }); + return names; +} + static void ShareTensorsFromScope( const std::vector &tensors, const paddle::framework::BlockDesc &global_block, @@ -216,6 +246,52 @@ static void ShareTensorsFromScope( } } +static void ShareTensorsIntoScopeByValue( + const ::pir::Block *block, + const std::vector &tensors, + const std::vector<::pir::Value> &values, + paddle::framework::Scope *scope) { + auto names = GetNameFromValue(block, values); + ShareTensorsIntoScopeWithName(tensors, names, scope); +} + +static void ShareTensorsFromScopeByValue( + const ::pir::Block *block, + const std::vector &tensors, + const std::vector<::pir::Value> &values, + paddle::framework::Scope *scope) { + auto names = GetNameFromValue(block, values); + for (size_t i = 0; i < tensors.size(); ++i) { + auto &name = names[i]; + auto &value = values[i]; + if (value.impl() == nullptr) { + // skip stop_gradient. 
+ continue; + } + auto *var = scope->FindVar(name); + PADDLE_ENFORCE_NOT_NULL( + var, + paddle::platform::errors::NotFound("The output tensor %s is not in " + "RunProgram(Grad)Op'" + "s internal scope.", + name)); + CheckOutputVarStatus(*var, *tensors[i]); + // share tensor + if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + VLOG(2) << "share " << name << " from scope"; + *dst_tensor = src_tensor; + } else if (var->IsType()) { + auto &src_tensor = var->Get(); + auto *dst_tensor = const_cast( + dynamic_cast(tensors[i]->impl().get())); + *dst_tensor = src_tensor; + } + } +} + static void ShareTensorsFromScopeWithPartialBlock( const std::vector &tensors, const paddle::framework::BlockDesc &forward_global_block, @@ -309,8 +385,194 @@ static void GcScope(paddle::framework::Scope *scope) { delete garbages; // free mem } +template +void print_collection(const T &t) { + VLOG(5) << "Print collection start :"; + for (auto s : t) { + VLOG(5) << s; + } + VLOG(5) << "Print collection end."; +} + } // namespace details +inline void NewIRRunProgramAPI( + const std::vector &x, + const std::vector ¶ms, + std::vector &out, // NOLINT + std::vector &middles, // NOLINT + std::vector &step_scope, // NOLINT + std::vector &dout, // NOLINT + bool require_any_grad, + const paddle::framework::AttributeMap &attrs) { + VLOG(2) << "RunProgramOpKernel Compute"; + // In the original run_program OP, the default value of the is_test + // attribute is false, we should check if there is is_test parameter + // in attrs + auto is_test = false; + if (attrs.count("is_test")) { + is_test = PADDLE_GET_CONST(bool, attrs.at("is_test")); + } + int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id")); + auto place = egr::Controller::Instance().GetExpectedPlace(); + + // NOTE(chenweihang): In order not to add new variable type, use vector + // here. Originally, here can use scope directly. 
+ auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), + 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + + VLOG(2) << "RunProgramOp use interpretercore to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + + VLOG(4) << "global_inner_scope:" << global_inner_scope; + + auto input_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fx")); + auto output_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fo")); + auto middle_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fm")); + auto param_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp")); + // auto dout_names = + // PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("fp")); + + auto *forward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("forward_global_block")); + auto *backward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block")); + + auto *forward_program = + forward_global_block->GetParentOp()->GetParentProgram(); + auto *backward_program = + backward_global_block->GetParentOp()->GetParentProgram(); + + if (VLOG_IS_ON(4)) { + std::ostringstream print_stream; + forward_program->Print(print_stream); + print_stream << "\n"; + backward_program->Print(print_stream); + VLOG(4) << print_stream.str(); + } + + VLOG(10) << is_test << program_id; + + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has( + program_id, global_inner_scope, /*is_grad=*/false)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cache, so create a new interpretercore " + "for program: " + << program_id; + // Step 1. share input_vars & parameters into scope + details::ShareTensorsIntoScopeByValue( + forward_global_block, x, input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue( + forward_global_block, params, param_values, global_inner_scope); + // Step 2. create new interpretercore + auto kernel_forward_program = + paddle::dialect::PdOpLowerToKernelPass(forward_program, place); + interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(kernel_forward_program), + place, + /*is_grad=*/false, + program_id, + global_inner_scope); + // Step 3. 
get all eager gc vars + // std::set skip_eager_delete_vars = + // paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( + // *backward_program); + + // update interpretercore skip_gc_var + auto skip_names = + details::GetNameFromValue(forward_global_block, middle_values); + auto skip_names_set = + std::set(skip_names.begin(), skip_names.end()); + skip_names = details::GetNameFromValue(forward_global_block, output_values); + skip_names_set.insert(skip_names.begin(), skip_names.end()); + details::print_collection(skip_names_set); + interpreter_core->SetSkipGcVars(skip_names_set); + + // std::set input_vars; + // input_vars.insert(input_names.begin(), input_names.end()); + // interpreter_core->SetJitInputVars(input_vars); + + // interpretercore_info_cache.UpdateSkipEagerDeleteVars( + // program_id, global_inner_scope, false, skip_eager_delete_vars); + } else { + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cache by program:" << program_id; + // Step 1. get cache interpretercore + auto &cached_value = interpretercore_info_cache.GetMutable( + program_id, global_inner_scope, /*is_grad=*/false); + interpreter_core = cached_value.core_; + // Step 2. update scope for cache interpretercore + details::ShareTensorsIntoScopeByValue( + forward_global_block, x, input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue( + forward_global_block, params, param_values, global_inner_scope); + // TODO(xiongkun): new ir how to build scope. + // if (interpreter_core->GetVariableScope()->GetMutableScope() != + // global_inner_scope) { + // details::BuildScopeByBlock( + // *interpreter_core.get(), *forward_global_block, global_inner_scope); + // interpreter_core->reset_scope(global_inner_scope); + //} + } + + // interpretercore run + if (!forward_global_block->empty()) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + interpreter_core->Run({}); + } + + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Get Output, and Middle Outputs + details::ShareTensorsFromScopeByValue( + forward_global_block, out, output_values, global_inner_scope); + details::ShareTensorsFromScopeByValue( + forward_global_block, middles, middle_values, global_inner_scope); + + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + + if (is_test || !require_any_grad) { + VLOG(4) << "don't require any grad, set this scope can reused"; + VLOG(4) << "is_test: " << is_test + << ", require_any_grad: " << require_any_grad; + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + } else { + VLOG(4) << "not test, set this scope can not reused"; + global_inner_scope->SetCanReused(false); + details::GcScope(global_inner_scope); // we can gc all the time, because + // we save the middles. 
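      // (Assumed rationale: ShareTensorsFromScopeByValue above makes the
      //  GradNode's middle/output tensors share their holders with the scope
      //  variables, so clearing the scope's local variables here does not free
      //  memory that the backward pass still needs.)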
+ } + } + +#ifdef PADDLE_WITH_DNNL + if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); +#endif +} + inline void RunProgramAPI( const std::vector &x, const std::vector ¶ms, @@ -665,12 +927,164 @@ inline void RunProgramGradAPI( } } +inline void NewIRRunProgramGradAPI( + const std::vector &x, + const std::vector ¶ms, + const std::vector &out_grad, + const std::vector &middles, + const std::vector &out, + const std::vector &step_scope, // NOLINT + const paddle::framework::AttributeMap &attrs, + std::vector &x_grad, // NOLINT + std::vector ¶ms_grad // NOLINT +) { + // if all output vars are set to stop_gradient, grad op no need to executed + if (x_grad.empty() && params_grad.empty()) return; + auto *out_scope_vec = &step_scope; + PADDLE_ENFORCE_EQ( + out_scope_vec->size(), + 1, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should only hold one scope.")); + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + + int64_t program_id = PADDLE_GET_CONST(int64_t, attrs.at("program_id")); + + auto place = egr::Controller::Instance().GetExpectedPlace(); + VLOG(2) << "RunProgramGradOp use interpretercore to execute program."; + + VLOG(4) << "global_inner_scope:" << global_inner_scope; + + auto *backward_global_block = + PADDLE_GET_CONST(::pir::Block *, attrs.at("backward_global_block")); + auto *backward_program = + backward_global_block->GetParentOp()->GetParentProgram(); + + auto output_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo_g")); + auto forward_input_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx")); + auto forward_middle_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bm")); + auto forward_output_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bo")); + auto x_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bx_g")); + auto p_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs.at("bp_g")); + + auto &interpretercore_info_cache = + paddle::framework::InterpreterCoreInfoCache::Instance(); + std::shared_ptr interpreter_core = + nullptr; + if (!interpretercore_info_cache.Has( + program_id, global_inner_scope, /*is_grad=*/true)) { + paddle::platform::RecordEvent record_event( + "create_new_interpretercore", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; + // Step 1. share input_vars & parameters into scope + // x, param, middles, output_grads + details::ShareTensorsIntoScopeByValue(backward_global_block, + out_grad, + output_grad_values, + global_inner_scope); + details::ShareTensorsIntoScopeByValue( + backward_global_block, x, forward_input_values, global_inner_scope); + details::ShareTensorsIntoScopeByValue(backward_global_block, + middles, + forward_middle_values, + global_inner_scope); + details::ShareTensorsIntoScopeByValue( + backward_global_block, out, forward_output_values, global_inner_scope); + auto kernel_backward_program = + paddle::dialect::PdOpLowerToKernelPass(backward_program, place); + interpreter_core = paddle::framework::CreateNewIRInterpreterCoreInfoToCache( + std::move(kernel_backward_program), + place, + /*is_grad=*/true, + program_id, + global_inner_scope); + // share threadpool + // NOTE(zhiqiu): this only works interpreter_core is executed strictly + // after the related fwd_interpreter_core. 
+ if (interpretercore_info_cache.Has(program_id, global_inner_scope, false)) { + auto fwd_interpreter_core = + interpretercore_info_cache + .GetMutable(program_id, global_inner_scope, /*is_grad=*/false) + .core_; + interpreter_core->ShareWorkQueueFrom(fwd_interpreter_core); + VLOG(4) << "Share workqueue from " << fwd_interpreter_core.get() << " to " + << interpreter_core.get(); + } + + // get all eager gc vars + std::set skip_eager_delete_vars; + auto skip_names = + details::GetNameFromValue(backward_global_block, x_grad_values); + skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); + skip_names = + details::GetNameFromValue(backward_global_block, p_grad_values); + skip_eager_delete_vars.insert(skip_names.begin(), skip_names.end()); + interpreter_core->SetSkipGcVars(skip_eager_delete_vars); + interpretercore_info_cache.UpdateSkipEagerDeleteVars( + program_id, + global_inner_scope, + /*is_grad=*/true, + skip_eager_delete_vars); + VLOG(2) << "Get skip GC vars size is: " << skip_eager_delete_vars.size(); + details::print_collection(skip_eager_delete_vars); + } else { + paddle::platform::RecordEvent record_event( + "get_interpretercore_cahce", + paddle::platform::TracerEventType::UserDefined, + 1); + VLOG(2) << "Get interpretercore cahce by program:" << program_id; + auto &cached_value = interpretercore_info_cache.GetMutable( + program_id, global_inner_scope, /*is_grad=*/true); + interpreter_core = cached_value.core_; + + // update scope (TODO: why share again) + // details::ShareTensorsIntoScope(out_grad, global_inner_scope); + // if (interpreter_core->GetVariableScope()->GetMutableScope() != + // global_inner_scope) { + // details::BuildScopeByBlock( + // *interpreter_core.get(), *backward_global_block, global_inner_scope); + // interpreter_core->reset_scope(global_inner_scope); + //} + } + + if (!backward_global_block->empty()) { + paddle::platform::RecordEvent record_event( + "interpreter_core_run", + paddle::platform::TracerEventType::UserDefined, + 1); + // Debug info: scope info when run end + VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); + interpreter_core->Run({}); + } + + { + paddle::platform::RecordEvent record_event( + "fetch_and_gc", paddle::platform::TracerEventType::UserDefined, 1); + // Step 4. get outputs + details::ShareTensorsFromScopeByValue( + backward_global_block, x_grad, x_grad_values, global_inner_scope); + details::ShareTensorsFromScopeByValue( + backward_global_block, params_grad, p_grad_values, global_inner_scope); + VLOG(4) << "after backward gc all vars"; + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + } +} + class GradNodeRunProgram : public egr::GradNodeBase { public: GradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} - ~GradNodeRunProgram() { + ~GradNodeRunProgram() override { if (!executed_) { auto *out_scope_vec = &step_scope_; VLOG(4) << "~GradNodeRunProgram"; @@ -833,3 +1247,187 @@ class GradNodeRunProgram : public egr::GradNodeBase { bool executed_{false}; }; + +class NewIRGradNodeRunProgram : public egr::GradNodeBase { + public: + NewIRGradNodeRunProgram(size_t bwd_in_slot_num, size_t bwd_out_slot_num) + : egr::GradNodeBase(bwd_in_slot_num, bwd_out_slot_num) {} + + ~NewIRGradNodeRunProgram() override { + if (!executed_) { + auto *out_scope_vec = &step_scope_; + VLOG(4) << "~GradNodeRunProgram"; + // Normally out_scope_vec.size() == 1. for safty, we add for-loop here. 
+ for (size_t i = 0; i < out_scope_vec->size(); ++i) { + paddle::framework::Scope *global_inner_scope = out_scope_vec->at(i); + global_inner_scope->SetCanReused(true); + details::GcScope(global_inner_scope); + VLOG(4) << "global_inner_scope SetCanReused"; + } + middles_.clear(); + outputs_.clear(); + } + } + // Functor: perform backward computations + virtual paddle::small_vector, + egr::kSlotSmallVectorSize> + operator()(paddle::small_vector, + egr::kSlotSmallVectorSize> &grads, // NOLINT + bool create_graph UNUSED, + bool is_new_grad UNUSED) override { + VLOG(3) << "Running Eager Backward Node: GradNodeRunProgram"; + paddle::small_vector, egr::kSlotSmallVectorSize> + hooked_grads = NewIRGradNodeRunProgram::ApplyGradientHooks(grads); + PADDLE_ENFORCE_EQ(hooked_grads.size(), + 1, + paddle::platform::errors::InvalidArgument( + "The hooked_grads.size() of RunProgramGradOp should " + "be equal to 1.")); + + std::vector x_grad; + std::vector params_grad; + std::vector x_grad_ptr; + std::vector params_grad_ptr; + { + paddle::platform::RecordEvent record_event( + "construct_grad_tensor", + paddle::platform::TracerEventType::UserDefined, + 1); + + egr::EagerUtils::FillZeroForEmptyOptionalGradInput(&hooked_grads[0], + this->InputMeta()[0]); + VLOG(3) << "hooked_grads[0].size() : " << hooked_grads[0].size(); + ConstructXGradTensors(x_, &x_grad); + ConstructParamGradTensors(params_, ¶ms_grad); + for (auto &i : x_grad) { + x_grad_ptr.emplace_back(&i); + } + for (auto &i : params_grad) { + if (i.defined()) { + params_grad_ptr.emplace_back(&i); + } + } + } + + auto out_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bo_g")); + PADDLE_ENFORCE_EQ(hooked_grads[0].size(), + out_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The hooked_grads[0].size() and " + "out_grad_values.size() should be equal.")); + + VLOG(1) << "Run Program Grad API start."; + NewIRRunProgramGradAPI(x_, + params_, + hooked_grads[0], + middles_, + outputs_, + step_scope_, + attrs_, + x_grad_ptr, + params_grad_ptr); + VLOG(1) << "Run Program Grad API end."; + VLOG(3) << "End Eager Backward Node: GradNodeRunProgram"; + + executed_ = true; + return {x_grad, params_grad}; + } + + void ClearTensorWrappers() override { + x_.clear(); + params_.clear(); + middles_.clear(); + outputs_.clear(); + SetIsTensorWrappersCleared(true); + } + + // SetAttrMap + void SetAttrMap(const paddle::framework::AttributeMap &attrs) { + attrs_ = attrs; + } + + void SetFwdX(const std::vector &tensors) { x_ = tensors; } + + std::vector &GetMiddle() { return middles_; } + + std::vector &GetOutputs() { return outputs_; } + + void SetFwdParams(const std::vector &tensors) { + params_ = tensors; + } + + void SetStepScope(const std::vector &scopes) { + step_scope_ = scopes; + } + + protected: + void ConstructXGradTensors(const std::vector &x, + std::vector *x_grad) { + auto x_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bx_g")); + PADDLE_ENFORCE_EQ( + x.size(), + x_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The x.size() and x_grad_names.size() should be equal. " + "But received x.size() = %d, x_grad_names.size() = %d", + x.size(), + x_grad_values.size())); + + // TODO(dev): Need an elegant way to determine inforamtion of grad_tensor, + // such as: name, tensor type(DenseTensor or SelectedRows). 
+ for (size_t i = 0; i < x.size(); i++) { + if (x[i].is_dense_tensor()) { + x_grad->emplace_back(std::make_shared()); + } else if (x[i].is_selected_rows()) { + x_grad->emplace_back(std::make_shared()); + } + } + } + + void ConstructParamGradTensors(const std::vector ¶ms, + std::vector *param_grads) { + auto p_grad_values = + PADDLE_GET_CONST(std::vector<::pir::Value>, attrs_.at("bp_g")); + PADDLE_ENFORCE_EQ(params.size(), + p_grad_values.size(), + paddle::platform::errors::InvalidArgument( + "The param.size() and " + "param_grad_names.size() should be equal.")); + + for (size_t i = 0; i < params.size(); ++i) { + auto &p = params[i]; + auto &p_grad = egr::EagerUtils::unsafe_autograd_meta(p)->Grad(); + // In eager mode, the number of param_grad should be the same as + // param, so here an empty Tensor is added for the param with + // stop_gradient=True + if (!p_grad.defined()) { + param_grads->emplace_back(); + } else if (p_grad.is_dense_tensor()) { + param_grads->emplace_back(std::make_shared()); + } else if (p_grad.is_selected_rows()) { + param_grads->emplace_back(std::make_shared()); + } + } + } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new NewIRGradNodeRunProgram(*this)); + return copied_node; + } + + private: + // TensorWrappers + std::vector x_; + std::vector params_; + std::vector middles_; + std::vector outputs_; + std::vector step_scope_; + + // Attribute Map + paddle::framework::AttributeMap attrs_; + + bool executed_{false}; +}; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d974076f2e645..526847bb32de5 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,7 +46,7 @@ cc_library( cc_library( op_compat_sensible_pass SRCS op_compat_sensible_pass.cc - DEPS graph_pattern_detector op_def_api pass) + DEPS graph_pattern_detector op_def_api pass pir_core) cc_library( subgraph_detector SRCS subgraph_detector.cc diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 17d2bdda56cb9..e0ab584ee3225 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/generate_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" namespace paddle { @@ -47,6 +49,12 @@ class element_visitor { int index_; }; +template <> +Attribute element_visitor::operator()( + const std::vector<::pir::Value>& attr UNUSED) const { + PADDLE_THROW(platform::errors::Unimplemented("Unimplemented operand.")); +} + class operation_visitor { public: explicit operation_visitor(const proto::PassDesc::OperationType& type) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index ab74b2691b062..a2eef6417870a 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -25,6 +25,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/ops_extra_info.h" #include "paddle/phi/common/complex.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" namespace paddle { @@ -964,7 +966,12 @@ struct SetAttrDescVisitor { void operator()(const std::vector &v) const { VectorToRepeated(v, attr_->mutable_bools()); } - + void operator()(const std::vector &v) const { + // just do nothing. + } + void operator()(const std::vector &v) const { + // just do nothing. + } void operator()(const std::vector &v) const { std::vector var_names; for (auto var : v) { diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 961b7c1e663c0..4ad1bcb80c4bc 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/fluid/imperative/type_defs.h" #include "paddle/phi/common/scalar.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" #include "paddle/utils/blank.h" #include "paddle/utils/small_vector.h" #include "paddle/utils/variant.h" @@ -62,7 +64,9 @@ using Attribute = paddle::variant, double, paddle::experimental::Scalar, - std::vector>; + std::vector, + ::pir::Block*, + std::vector<::pir::Value>>; using AttributeMap = std::unordered_map; using OpCreator = diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index 1deb20fbf9b88..1c40ce4275c42 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -21,7 +21,7 @@ namespace paddle { namespace pybind { -static PyObject *eager_api_run_program(PyObject *self, +static PyObject *eager_api_run_program(PyObject *self, // TOREMOVE PyObject *args, PyObject *kwargs) { PyThreadState *tstate = nullptr; @@ -61,11 +61,58 @@ static PyObject *eager_api_run_program(PyObject *self, } } +static PyObject *newir_eager_api_run_program(PyObject *self, + PyObject *args, + PyObject *kwargs) { + PyThreadState *tstate = nullptr; + try { + auto X = GetTensorListFromArgs("run_program", "X", args, 0, true); + auto Params = GetTensorListFromArgs("run_program", "Params", args, 1, true); + auto Out = GetTensorPtrListFromArgs("run_program", "Out", args, 2, true); + auto OutScope = + GetScopePtrListFromArgs("run_program", "OutScope", args, 3, false); + auto DOut = GetTensorPtrListFromArgs("run_program", "DOut", args, 4, true); + framework::AttributeMap attrs; + // TODO(zengjinle): support CUDA Graph on eager mode + VLOG(1) << "Start NewIR ConstructAttrMapFromPyArgs"; + + ConstructAttrMapForRunProgram( + "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); + + VLOG(1) << "Finish NewIR ConstructAttrMapFromPyArgs"; + tstate = PyEval_SaveThread(); + newir_run_program_ad_func(X, Params, Out, OutScope, DOut, attrs); + PyEval_RestoreThread(tstate); + tstate = nullptr; + Py_RETURN_NONE; + } catch (paddle::platform::EnforceNotMet &exception) { + if (tstate) { + PyEval_RestoreThread(tstate); + } + std::ostringstream sout; + sout << exception.what(); + sout << " [operator < run_program > error]"; + exception.set_error_str(sout.str()); + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } catch (...) 
{ + if (tstate) { + PyEval_RestoreThread(tstate); + } + ThrowExceptionToPython(std::current_exception()); + return nullptr; + } +} + static PyMethodDef CustomEagerMethods[] = { {"run_program", (PyCFunction)(void (*)(void))eager_api_run_program, METH_VARARGS | METH_KEYWORDS, "C++ interface function for run_program in dygraph."}, + {"newir_run_program", + (PyCFunction)(void (*)(void))newir_eager_api_run_program, + METH_VARARGS | METH_KEYWORDS, + "C++ interface function for run_program in dygraph."}, {nullptr, nullptr, 0, nullptr}}; } // namespace pybind diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index db93c895eabf3..d6ad957e44959 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -32,6 +32,7 @@ #include "paddle/fluid/pir/dialect/operator/ir/op_dialect.h" #include "paddle/fluid/pir/dialect/operator/ir/op_type.h" #include "paddle/fluid/pir/dialect/operator/ir/pd_api.h" +#include "paddle/fluid/pir/dialect/operator/ir/pd_op.h" #include "paddle/fluid/pir/dialect/operator/utils/utils.h" #include "paddle/fluid/pir/transforms/inplace_pass.h" #include "paddle/phi/core/enforce.h" @@ -142,6 +143,12 @@ void BindProgram(py::module *m) { self->Print(print_stream); return print_stream.str(); }) + .def("__repr__", + [](const std::shared_ptr &self) { + std::ostringstream print_stream; + self->Print(print_stream); + return print_stream.str(); + }) .def("parameters_num", [](const std::shared_ptr &self) { return self->parameters_num(); @@ -459,6 +466,14 @@ void BindOpResult(py::module *m) { .def("has_one_use", &Value::HasOneUse) .def("use_empty", &OpResult::use_empty) .def("type", &OpResult::type) + .def("is_dense_tensor_type", + [](OpResult &self) { + if (self.type().isa()) { + return true; + } else { + return false; + } + }) .def_property( "stop_gradient", [](OpResult &self) { @@ -517,7 +532,324 @@ void BindType(py::module *m) { }); } +Operation *BuildOpFrom( + const Operation *to_copy_op, + std::unordered_map &value_map) { // NOLINT + pir::OperationArgument to_create_argument(to_copy_op->info()); + to_create_argument.attributes = to_copy_op->attributes(); + + auto origin_results = to_copy_op->results(); + std::transform(origin_results.begin(), + origin_results.end(), + std::back_inserter(to_create_argument.output_types), + [](const pir::OpResult &r) { + // OpResult -> OpType + return r.type(); + }); + + // transform by value_map dict. + auto origin_operands = to_copy_op->operands(); + std::transform(origin_operands.begin(), + origin_operands.end(), + std::back_inserter(to_create_argument.inputs), + [&value_map](const pir::OpOperand &operand) { + // Operand -> OpResult + return value_map[operand.source()].impl(); + }); + auto *cloned_op = Operation::Create(std::move(to_create_argument)); + + // update the mapping of value_map. std::transform is a map(func, zip()). + std::vector tmp; + std::transform(origin_results.begin(), + origin_results.end(), + cloned_op->results().begin(), + std::back_inserter(tmp), // NOLINT, just a placeholder. + [&value_map](const OpResult &a, const OpResult &b) { // NOLINT + value_map[a.Value::impl()] = b.Value::impl(); + return 1; + }); + return cloned_op; +} + +std::shared_ptr ProgramClone(const Program &program) { + // Limitation of this function: + // 1. don't support Parameters. + // 2. don't support Regions in operator. 
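  // Minimal usage sketch from Python (assuming ProgramClone is exposed next to
  // fake_op_result via the BindUtils registrations below; names hypothetical):
  //
  //   from paddle.base.libpaddle.ir import program_clone
  //   cloned = program_clone(main_program)  # copies ops only, not parameters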
+ pir::IrContext *ctx = pir::IrContext::Instance(); + auto cloned_program = std::make_shared(ctx); + std::unordered_map value_map; + for (auto &op : *program.block()) { + auto *cloned_op = BuildOpFrom(op, value_map); + cloned_program->block()->push_back(cloned_op); + } + return cloned_program; +} + +std::list::const_iterator list_offset(const Block *block, + int start_idx) { + auto it = block->begin(); + while (start_idx--) ++it; + return it; +} + +template +void range_block_do(const Block *block, std::vector range, F fn) { + for (auto it = list_offset(block, range[0]); + it != list_offset(block, range[1]); + ++it) { + fn(*it); + } +} + +std::vector AnalysisMiddleVariable( + const Program &program, + const std::vector &forward_inputs, + const std::vector &forward_range, + const std::vector &backward_range) { + std::vector middle_values; + + std::unordered_set backward_inputs; + std::unordered_set x_or_param(forward_inputs.begin(), + forward_inputs.end()); + range_block_do( + program.block(), backward_range, [&backward_inputs](Operation *op) { + for (auto &t : op->operands()) { + backward_inputs.insert(t.source()); + } + }); + + range_block_do( + program.block(), + forward_range, + [&middle_values, &backward_inputs, &x_or_param](Operation *op) { + for (auto &t : op->results()) { + auto v = Value(t.Value::impl()); + if (backward_inputs.count(v) && !x_or_param.count(v)) + middle_values.push_back(v); + } + }); + return middle_values; +} + +void mapping_value(const std::vector &origin, + const std::unordered_map &value_map, + std::vector &out) { // NOLINT + std::transform(origin.begin(), + origin.end(), + std::back_inserter(out), + [&value_map](const pir::Value &v) { + if (v.impl() == nullptr) return Value(nullptr); + return value_map.at(v); + }); +} + +using SplitedProgram = std::vector>; +using SplitedAttribute = std::map>; +using SplitedResult = std::pair; + +pir::OpResult FakeOpResult() { + // create a fake opresults to simplify `ForwardBackwardSplit`. 
+ return pir::OpResult(nullptr); +} + +SplitedResult ForwardBackwardSplit( + const Program &program, + const std::vector &op_result_forward_inputs, + const std::vector &op_result_forward_outputs, + const std::vector &op_result_forward_inputs_grads, + const std::vector &op_result_forward_outputs_grads, + const std::vector &forward_range, + const std::vector &backward_range) { + // transform opresult -> value + VLOG(1) << "Start Prepare data structures."; + std::vector forward_inputs, forward_outputs, forward_inputs_grads, + forward_outputs_grads; + + auto op_result_to_value = [](const pir::OpResult &r) { + if (r.impl() == nullptr) return Value(nullptr); + return Value(r.Value::impl()); + }; + + std::transform(op_result_forward_inputs.begin(), + op_result_forward_inputs.end(), + std::back_inserter(forward_inputs), + op_result_to_value); + std::transform(op_result_forward_outputs.begin(), + op_result_forward_outputs.end(), + std::back_inserter(forward_outputs), + op_result_to_value); + std::transform(op_result_forward_inputs_grads.begin(), + op_result_forward_inputs_grads.end(), + std::back_inserter(forward_inputs_grads), + op_result_to_value); + std::transform(op_result_forward_outputs_grads.begin(), + op_result_forward_outputs_grads.end(), + std::back_inserter(forward_outputs_grads), + op_result_to_value); + + std::vector forward_in_out_values; + for (auto &v : std::vector *>( + {&forward_inputs, &forward_outputs})) { + forward_in_out_values.insert( + forward_in_out_values.end(), v->begin(), v->end()); + } + + std::vector fx, fp, fm, fo, bx, bp, bm, bo_g, bx_g, bp_g, bo; + pir::IrContext *ctx = pir::IrContext::Instance(); + auto forward_program = std::make_shared(ctx); + auto backward_program = std::make_shared(ctx); + auto middle_values = AnalysisMiddleVariable( + program, forward_in_out_values, forward_range, backward_range); + std::unordered_map forward_value_map; + std::unordered_map backward_value_map; + pir::Builder backward_builder = pir::Builder(ctx, backward_program->block()); + + // forward program construct. + VLOG(1) << "Before Forward Construct."; + range_block_do(program.block(), + forward_range, + [&forward_value_map, &forward_program](Operation *op) { + auto *cloned_op = BuildOpFrom(op, forward_value_map); + forward_program->block()->push_back(cloned_op); + }); + VLOG(1) << "After Forward Construct."; + + // backward program construc. + // Step1. insert data op for inputs_values and middle_values + int counter = 0; + auto create_data_fn = [&backward_builder, &backward_value_map, &counter]( + const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto value_type = v.type().dyn_cast(); + auto dtype = paddle::dialect::TransToPhiDataType(value_type.dtype()); + auto shape = phi::vectorize(value_type.dims()); + auto place = phi::CPUPlace(); // TODO(xiongkun): how to get default places. 
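      // (Descriptive note, inferred from GetNameFromValue in
      //  run_program_op_node.h: every pd_op.data op built here carries a
      //  unique "input_<counter>" name, and that "name" attribute is what the
      //  run_program kernel later uses to find the variable in its scope.)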
+ + paddle::dialect::DataOp op = + backward_builder.Build( + std::string("input_") + std::to_string(counter), + shape, + dtype, + place); + counter += 1; + backward_value_map[v] = op->results()[0].Value::impl(); + }; + + auto create_output_fn_forward = [&ctx, + &forward_value_map, + &counter, + &forward_program](const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name()); + pir::AttributeMap attribute_map = { + {"parameter_name", + pir::StrAttribute::get( + ctx, std::string("output_") + std::to_string(counter))}, + }; + pir::Operation *operation = pir::Operation::Create( + {OpResult(forward_value_map[v].impl())}, attribute_map, {}, op_info); + forward_program->block()->push_back(operation); + counter += 1; + }; + + auto create_output_fn_backward = [&ctx, + &backward_value_map, + &counter, + &backward_program](const pir::Value &v) { + if (v.impl() == nullptr) { + return; + } + auto op_info = ctx->GetRegisteredOpInfo(pir::SetParameterOp::name()); + pir::AttributeMap attribute_map = { + {"parameter_name", + pir::StrAttribute::get( + ctx, std::string("output_") + std::to_string(counter))}, + }; + pir::Operation *operation = + pir::Operation::Create({OpResult(backward_value_map.at(v).impl())}, + attribute_map, + {}, + op_info); + backward_program->block()->push_back(operation); + counter += 1; + }; + + counter = 0; + std::for_each(forward_outputs.begin(), forward_outputs.end(), create_data_fn); + std::for_each(forward_inputs.begin(), forward_inputs.end(), create_data_fn); + std::for_each(middle_values.begin(), middle_values.end(), create_data_fn); + std::for_each(forward_outputs_grads.begin(), + forward_outputs_grads.end(), + create_data_fn); + VLOG(1) << "After create pd.data for backward program."; + + counter = 0; + std::for_each( + middle_values.begin(), middle_values.end(), create_output_fn_forward); + std::for_each( + forward_outputs.begin(), forward_outputs.end(), create_output_fn_forward); + + VLOG(1) << "After call create_output_fn"; + // Step2. copy backward ops . + range_block_do(program.block(), + backward_range, + [&backward_value_map, &backward_program](Operation *op) { + auto *cloned_op = BuildOpFrom(op, backward_value_map); + backward_program->block()->push_back(cloned_op); + }); + VLOG(1) << "After call backward copy"; + counter = 0; + std::for_each(forward_inputs_grads.begin(), + forward_inputs_grads.end(), + create_output_fn_backward); + // TODO(xiongkun): add forward parameter grads. + + VLOG(1) << "forward_value_map.size() is " << forward_value_map.size(); + VLOG(1) << "backward_value_map.size() is " << backward_value_map.size(); + std::ostringstream print_stream; + print_stream << "ForwardProgram is :\n"; + forward_program->Print(print_stream); + print_stream << "BackwardProgram is:\n"; + backward_program->Print(print_stream); + VLOG(1) << "Splited Program (fwd | bwd): \n" << print_stream.str(); + + // construct all attributes we needed. 
+ + mapping_value(middle_values, forward_value_map, fm); // write 'fm' + mapping_value(middle_values, backward_value_map, bm); // write 'bm' + mapping_value(forward_inputs, forward_value_map, fx); // write 'fx' + mapping_value(forward_inputs, backward_value_map, bx); // write 'bx' + mapping_value(forward_outputs, forward_value_map, fo); // write 'fo' + mapping_value( + forward_inputs_grads, backward_value_map, bx_g); // write 'fx_g' + mapping_value( + forward_outputs_grads, backward_value_map, bo_g); // write 'bo_g' + mapping_value(forward_outputs, backward_value_map, bo); // write 'bo' + + std::map> attr = {{"fx", fx}, + {"fp", fp}, + {"fm", fm}, + {"fo", fo}, + {"bx", bx}, + {"bp", bp}, + {"bm", bm}, + {"bo_g", bo_g}, + {"bx_g", bx_g}, + {"bp_g", bp_g}, + {"bo", bo}}; + std::vector> programs = {forward_program, + backward_program}; + return std::make_pair(programs, attr); +} + void BindUtils(pybind11::module *m) { + m->def("program_clone", ProgramClone); + m->def("program_split", ForwardBackwardSplit); + m->def("fake_op_result", FakeOpResult); m->def("set_global_program", [](Program *program) { APIBuilder::Instance().SetProgram(program); }); m->def("set_insertion_point", diff --git a/paddle/fluid/pybind/op_function_common.cc b/paddle/fluid/pybind/op_function_common.cc index 266578615e352..a1e22b94ce192 100644 --- a/paddle/fluid/pybind/op_function_common.cc +++ b/paddle/fluid/pybind/op_function_common.cc @@ -34,6 +34,8 @@ #include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/imperative.h" #include "paddle/phi/common/complex.h" +#include "paddle/pir/core/block.h" +#include "paddle/pir/core/value.h" namespace paddle { namespace pybind { @@ -829,6 +831,54 @@ void CastPyArg2AttrBlock(PyObject* obj, attrs[key] = reinterpret_cast(vh[0]); } +void CastPyArg2AttrIRBlock(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + VLOG(1) << "After Process pir::Block*"; + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)obj; // NOLINT + void** vh = inst->simple_layout ? inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + attrs[key] = reinterpret_cast<::pir::Block*&>(vh[0]); +} + +void CastPyArg2AttrValues(PyObject* obj, + paddle::framework::AttributeMap& attrs, // NOLINT + const std::string& key, + const std::string& op_type, + ssize_t arg_pos) { + std::vector<::pir::Value> results; + if (PyList_Check(obj)) { + Py_ssize_t len = PyList_Size(obj); + PyObject* item = nullptr; + for (Py_ssize_t i = 0; i < len; i++) { + // TODO(xiongkun): judge OpResult or Value; + item = PyList_GetItem(obj, i); + ::pybind11::detail::instance* inst = + (::pybind11::detail::instance*)item; // NOLINT + void** vh = inst->simple_layout ? 
inst->simple_value_holder + : &inst->nonsimple.values_and_holders[0]; + ::pir::OpResult* opresult = reinterpret_cast<::pir::OpResult*>(vh[0]); + if (opresult->impl() == nullptr) { + results.emplace_back(pir::Value(nullptr)); + } else { + results.emplace_back(pir::Value(opresult->Value::impl())); + } + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be " + "a list of int, float, complex, or bool, but got %s", + op_type, + arg_pos + 1, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + attrs[key] = results; + VLOG(1) << "Pybind: Cast " << results.size() << " Value Finished."; +} + void ConstructAttrMapFromPyArgs( const std::string& op_type, PyObject* args, @@ -847,6 +897,7 @@ void ConstructAttrMapFromPyArgs( PyObject* obj = nullptr; for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + VLOG(1) << "Start Process " << arg_pos; Py_ssize_t key_len; const char* key_ptr; obj = PyTuple_GET_ITEM(args, arg_pos); @@ -862,6 +913,7 @@ void ConstructAttrMapFromPyArgs( } std::string key(key_ptr, (size_t)key_len); // NOLINT + VLOG(1) << "Start Process " << key; auto iter = attr_type_map->find(key); if (iter == attr_type_map->end()) { continue; @@ -921,6 +973,77 @@ void ConstructAttrMapFromPyArgs( } } +void ConstructAttrMapForRunProgram( + const std::string& op_type, + PyObject* args, + ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs) { // NOLINT + PADDLE_ENFORCE_EQ((attr_end - attr_start) % 2, + 0, + platform::errors::InvalidArgument( + "The number of arguments for attributes should be even " + "but attr_start = %d, attr_end = %d.", + attr_start, + attr_end)); + + PyObject* obj = nullptr; + for (ssize_t arg_pos = attr_start; arg_pos < attr_end; arg_pos += 2) { + VLOG(1) << "Start Process " << arg_pos; + Py_ssize_t key_len; + const char* key_ptr; + obj = PyTuple_GET_ITEM(args, arg_pos); + if (PyObject_CheckString(obj)) { + key_ptr = PyUnicode_AsUTF8AndSize(obj, &key_len); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument (position %d) must be str, but got " + "%s", + op_type, + arg_pos, + ((PyTypeObject*)obj->ob_type)->tp_name)); // NOLINT + } + + std::string key(key_ptr, (size_t)key_len); // NOLINT + VLOG(1) << "Start Process " << key; + obj = PyTuple_GET_ITEM(args, arg_pos + 1); + + if (std::set({"cuda_graph_capture_mode"}).count(key)) { + CastPyArg2AttrString(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"global_block", + "forward_global_block", + "backward_global_block"}) + .count(key)) { + CastPyArg2AttrIRBlock(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"is_test", "use_interpretorcore"}) + .count(key)) { + CastPyArg2AttrBoolean(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"start_op_index", + "end_op_index", + "program_id", + "cuda_graph_pool_id"}) + .count(key)) { + CastPyArg2AttrLong(obj, attrs, key, op_type, arg_pos); + } else if (std::set({"fx", + "fp", + "fm", + "fo", + "bx", + "bp", + "bm", + "bo_g", + "bx_g", + "bp_g", + "bo"}) + .count(key)) { + CastPyArg2AttrValues(obj, attrs, key, op_type, arg_pos); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s is not defined in this function.", key)); // NOLINT + } + } +} + unsigned long GetUnsignedLongFromArgs( // NOLINT const std::string& op_type, const std::string& arg_name, diff --git a/paddle/fluid/pybind/op_function_common.h b/paddle/fluid/pybind/op_function_common.h index a3f4960bbd58b..2d02dd6fb784d 100644 --- 
a/paddle/fluid/pybind/op_function_common.h +++ b/paddle/fluid/pybind/op_function_common.h @@ -194,6 +194,13 @@ void ConstructAttrMapFromPyArgs( ssize_t attr_end, paddle::framework::AttributeMap& attrs); // NOLINT +void ConstructAttrMapForRunProgram( + const std::string& op_type, + PyObject* args, + ssize_t attr_start, + ssize_t attr_end, + paddle::framework::AttributeMap& attrs); // NOLINT + unsigned long GetUnsignedLongFromArgs( // NOLINT const std::string& op_type, const std::string& arg_name, diff --git a/python/paddle/jit/dy2static/function_spec.py b/python/paddle/jit/dy2static/function_spec.py index d34a5dc6288f6..e2966e4097d86 100644 --- a/python/paddle/jit/dy2static/function_spec.py +++ b/python/paddle/jit/dy2static/function_spec.py @@ -18,7 +18,9 @@ import numpy as np import paddle +import paddle.ir.core as ir_static from paddle.base import core +from paddle.base.data_feeder import convert_dtype from paddle.base.dygraph.base import switch_to_static_graph from paddle.jit.translated_layer import TranslatedLayer from paddle.nn.layer import layers @@ -170,6 +172,34 @@ def args_to_input_spec(self, args, kwargs): return args_with_spec, kwargs_with_spec + @switch_to_static_graph + def newir_to_static_inputs_with_spec(self, input_with_spec, main_program): + """ + Constructs feed layer by inputs with InputSpec information for main program. + + Args: + input_with_spec(tuple): input arguments by replacing argument with InputSpec. + main_program(Program): main program for inserting feed layer. + """ + flat_input_spec = paddle.utils.flatten(input_with_spec) + + inputs = [] + with ir_static.program_guard(main_program): + for i, var_spec in enumerate(flat_input_spec): + if isinstance(var_spec, paddle.static.InputSpec): + stop_gradient = getattr(var_spec, 'stop_gradient', False) + feed_value = paddle.static.input.data( + name=var_spec.name or "feed_%s" % i, + shape=var_spec.shape, + dtype=convert_dtype(var_spec.dtype), + ) + feed_value.stop_gradient = stop_gradient + else: + feed_value = var_spec + inputs.append(feed_value) + + return paddle.utils.pack_sequence_as(input_with_spec, inputs) + @switch_to_static_graph def to_static_inputs_with_spec(self, input_with_spec, main_program): """ diff --git a/python/paddle/jit/dy2static/newir_partial_program.py b/python/paddle/jit/dy2static/newir_partial_program.py new file mode 100644 index 0000000000000..76954d58c6b09 --- /dev/null +++ b/python/paddle/jit/dy2static/newir_partial_program.py @@ -0,0 +1,1135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
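# Overview (assumed from the code below): this module is the new-IR counterpart
# of partial_program.py. It wraps the forward/backward programs produced by the
# C++ ForwardBackwardSplit pass and executes them through
# _legacy_C_ops.newir_run_program, passing the fx/fm/fo/bx/bm/bo_g/... value
# lists as run_program attributes.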
+ +import os +from copy import deepcopy + +import numpy as np + +import paddle +import paddle.ir.core as ir_static +from paddle import _legacy_C_ops +from paddle.amp.auto_cast import _in_amp_guard, _in_pure_fp16_guard +from paddle.autograd.ir_backward import grad +from paddle.base import core, framework, program_guard +from paddle.base.compiler import BuildStrategy +from paddle.base.data_feeder import check_type, convert_dtype +from paddle.base.dygraph.base import switch_to_static_graph +from paddle.base.framework import _apply_pass +from paddle.base.libpaddle.ir import OpResult, fake_op_result +from paddle.optimizer.lr import LRScheduler + +from . import logging_utils +from .utils import RETURN_NO_VALUE_MAGIC_NUM, backend_guard + +__all__ = [] + + +class NestSequence: + """ + A wrapper class that easily to flatten and restore the nest structure of + given sequence. + """ + + def __init__(self, raw_input, need_check=False): + self.__raw_input = raw_input + self.__input_list = self.tolist() + self.__var_ids = self._get_var_ids() + self._check_non_variable(need_check) + + def tolist(self): + """ + Flattens the nested sequences into single list. + """ + return paddle.utils.flatten(self.__raw_input) + + def restore(self, value_list): + """ + Restores the nested sequence from value list. + """ + assert len(self.__input_list) == len(value_list) + return paddle.utils.pack_sequence_as(self.__raw_input, value_list) + + def _get_var_ids(self): + var_ids = [] + for idx, var in enumerate(self.__input_list): + if isinstance(var, (OpResult, core.eager.Tensor)): + var_ids.append(idx) + + return var_ids + + def _check_non_variable(self, need_check): + """ + Raises warning if output of traced function contains non-tensor type values. + """ + if need_check: + warning_types = set() + for var in self.__input_list: + if not isinstance(var, (framework.Variable, core.eager.Tensor)): + warning_types.add(type(var)) + if warning_types: + logging_utils.warn( + "Output of traced function contains non-tensor type values: {}. " + "Currently, We don't support to update them while training and will return " + "what we first saw. Please try to return them as tensor.".format( + list(warning_types) + ) + ) + + @property + def var_ids(self): + return self.__var_ids + + def __getitem__(self, item): + return self.__input_list[item] + + +class LazyInitialized: + """ + Descriptor to implement lazy initialization of property. + """ + + def __init__(self, function): + self.function = function + + def __get__(self, instance, cls): + val = self.function(instance) + setattr(instance, self.function.__name__, val) + return val + + +class ProgramInfo: + """ + A helper class to recoder Program information + """ + + def __init__(self): + self.op_size = { + 'fp32': -1, + 'amp': -1, + 'fp16': -1, + } + self.programs = {} + self.mode = "infer" + + def __call__(self, key, prog_creator): + """ + Recoder infer program and op size. + """ + assert key in ['fp32', 'amp', 'fp16'] + if key not in self.programs: + infer_prog = prog_creator(is_infer_mode=True) + self.programs[key] = infer_prog + self.op_size[key] = infer_prog.desc.block(0).op_size() + + return self.programs[key], self.op_size[key] + + +class PartialProgramLayerHook: + def before_append_backward(self, forward_program): + ... + + def after_append_backward(self, whole_program, backward_start_idx): + ... + + def after_infer(self, infer_program): + ... 
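# A minimal sketch of a custom hook (hypothetical, for illustration only; the
# return conventions are assumed from how self._hooker is consumed below, e.g.
# `infer_program = self._hooker.after_infer(infer_program)`):
#
#   class LoggingHook(PartialProgramLayerHook):
#       def before_append_backward(self, forward_program):
#           print(forward_program)              # inspect before grad is added
#           return forward_program
#
#       def after_append_backward(self, whole_program, backward_start_idx):
#           return whole_program, backward_start_idx
#
#       def after_infer(self, infer_program):
#           return infer_program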
+ + +class PartialProgramLayer: + """ + PartialProgramLayer wraps all the ops from layers decorated by `@to_static` + and execute them as a static subgraph. + + .. note:: + **1. This is a very low level API. Users should not use this API + directly. Please use `partial_program_from(concrete_program)` + to create it. + **2. LoDTensorArray is not currently supported in the output. + + Args: + main_program(Program): The main program that contains ops need to be executed. + inputs(list[Variable]): The input list of the decorated function by `@to_static`. + outputs(list[Variable]): The output list of the decorated function by `@to_static`. + parameters(list[Tensor]|None): All trainable parameters included in the program. Default None. + + Returns: + Layer: A Layer object that run all ops internally in static graph mode. + """ + + def __init__( + self, main_program, inputs, outputs, parameters=None, **kwargs + ): + super().__init__() + self._inputs = NestSequence(inputs) + self._outputs = NestSequence(outputs, need_check=True) + self._params = parameters if parameters is not None else [] + + self._build_strategy = kwargs.get('build_strategy', BuildStrategy()) + assert isinstance(self._build_strategy, BuildStrategy) + + self._origin_main_program = self._verify_program(main_program) + self._cuda_graph_vec = self._create_cuda_graph_vec() + self._cuda_graph_capture_mode = "" + self._cuda_graph_pool_id = 0 + # Set default mode to train + self.training = True + self._infer_info = ProgramInfo() + self._program_extra_info = {} + + amp_dtype, custom_white_list, custom_black_list = None, None, None + tracer = framework._dygraph_tracer() + if tracer: + custom_white_list, custom_black_list = tracer._get_amp_op_list() + amp_dtype = tracer._amp_dtype + if amp_dtype is not None and amp_dtype in ['float16', 'bfloat16']: + # For AMP training + self._amp_list = ( + paddle.static.amp.fp16_lists.AutoMixedPrecisionLists( + custom_white_list=custom_white_list, + custom_black_list=custom_black_list, + dtype=amp_dtype, + ) + ) + + # program_id -> list(scope) + self._scope_cache = {} + self._hooker = None + self._backend = kwargs.get('backend', None) + self._grad_var_names = {} + + def __call__(self, inputs): + """ + Execute static graph by Interpreter and Return dynamic Tensors. 
+ """ + in_vars, out_vars = self._prepare(inputs) + self._cast_fp16_if_pure_fp16(in_vars) + attrs = self._prepare_attributes() + + # self._sync_lr_value_with_scheduler() + + c_run_program_fn = None + if ir_static._use_new_ir_api(): + c_run_program_fn = _legacy_C_ops.newir_run_program + else: + c_run_program_fn = _legacy_C_ops.run_program + c_run_program_fn( + self._valid_vars(in_vars), + self._valid_vars(self._params), + self._valid_vars(out_vars), + self._create_scope_vec( + program_id=self.program_id, use_scope_cache=True + ), + self._double_grads, + self._cuda_graph_vec, + *attrs, + ) + self._update_stop_gradient(out_vars) + restored_nest_out = self._restore_out(out_vars) + return self._remove_no_value(restored_nest_out) + + def _sync_lr_value_with_scheduler(self): + """Update lr_var value with calculated by lr_scheduler.""" + main_program = self._origin_main_program + if hasattr(main_program, 'lr_scheduler') and hasattr( + main_program, 'lr_var' + ): + lr_scheduler = main_program.lr_scheduler + lr_var = main_program.lr_var + + assert isinstance(lr_scheduler, LRScheduler), "must be LRScheduler" + lr_scheduler = self._origin_main_program.lr_scheduler + lr_value = lr_scheduler() + data = np.array(lr_value).astype(convert_dtype(lr_var.dtype)) + lr_var.set_value(data) + + def set_hooker(self, hooker): + self._hooker = hooker + + def _get_scope(self, program_id=None, use_scope_cache=False): + if use_scope_cache: + if program_id not in self._scope_cache: + scope = core.Scope() + self._scope_cache[program_id] = [scope] + return scope + else: + for scope in self._scope_cache[program_id]: + if scope._can_reused: + return scope + scope = core.Scope() + self._scope_cache[program_id].append(scope) + return scope + else: + return core.Scope() + + @LazyInitialized + def _double_grads(self): + # TODO: check the affects. + return None + + # whole + @switch_to_static_graph + def _create_program(self, is_infer_mode=False): + if is_infer_mode: + infer_program = self._origin_main_program.clone( + for_test=is_infer_mode + ) + if self._hooker: + infer_program = self._hooker.after_infer(infer_program) + return infer_program + else: + train_program = self._append_backward_desc( + self._origin_main_program + ) + # Note: Only set grad type once after initializing train program. So we put it here. 
+ self._set_grad_type(self._params, train_program) + return train_program + + @switch_to_static_graph + def _create_amp_program(self, is_infer_mode=False): + amp_program = self._origin_main_program.clone(for_test=is_infer_mode) + with program_guard(amp_program): + paddle.static.amp.fp16_utils.cast_model_to_fp16( + amp_program, self._amp_list, use_fp16_guard=False, level='O1' + ) + if is_infer_mode: + if self._hooker: + amp_program = self._hooker.after_infer(amp_program) + return amp_program + else: + train_amp_program = self._append_backward_desc(amp_program) + self._set_grad_type(self._params, train_amp_program) + return train_amp_program + + @switch_to_static_graph + def _create_pure_fp16_program(self, is_infer_mode=False): + pure_fp16_program = self._origin_main_program.clone( + for_test=is_infer_mode + ) + with program_guard(pure_fp16_program): + paddle.static.amp.fp16_utils.cast_model_to_fp16( + pure_fp16_program, self._amp_list, use_fp16_guard=False + ) + + if is_infer_mode: + if self._hooker: + pure_fp16_program = self._hooker.after_infer(pure_fp16_program) + return pure_fp16_program + else: + train_pure_fp16_program = self._append_backward_desc( + pure_fp16_program + ) + self._set_grad_type(self._params, train_pure_fp16_program) + return train_pure_fp16_program + + @switch_to_static_graph + def _create_forward_backward_train_program(self): + whole_program = self._train_program + forward_end_op_index = self.get_forward_end_op_idx(whole_program) + assert forward_end_op_index >= 0 + return self._get_forward_backward_program_form( + whole_program, forward_end_op_index + ) + + @switch_to_static_graph + def _create_forward_backward_train_amp_program(self): + whole_program = self._train_amp_program + forward_end_op_index = self.get_forward_end_op_idx(whole_program) + assert forward_end_op_index >= 0 + + return self._get_forward_backward_program_form( + whole_program, forward_end_op_index + ) + + @switch_to_static_graph + def _create_forward_backward_train_pure_fp16_program(self): + whole_program = self._train_pure_fp16_program + forward_end_op_index = self.get_forward_end_op_idx(whole_program) + assert forward_end_op_index >= 0 + + return self._get_forward_backward_program_form( + whole_program, forward_end_op_index + ) + + @LazyInitialized + def _train_program(self): + return self._create_program() + + @LazyInitialized + def _infer_program(self): + program, op_size = self._infer_info('fp32', self._create_program) + return self._build_infer_program(program, op_size) + + @LazyInitialized + def _train_amp_program(self): + return self._create_amp_program() + + @LazyInitialized + def _infer_amp_program(self): + program, op_size = self._infer_info('amp', self._create_amp_program) + return self._build_infer_program(program, op_size) + + @LazyInitialized + def _train_pure_fp16_program(self): + return self._create_pure_fp16_program() + + @LazyInitialized + def _infer_pure_fp16_program(self): + program, op_size = self._infer_info( + 'fp16', self._create_pure_fp16_program + ) + return self._build_infer_program(program, op_size) + + @LazyInitialized + def _train_forward_backward_program(self): + program = self._create_forward_backward_train_program() + return program + + @LazyInitialized + def _train_amp_forward_backward_program(self): + program = self._create_forward_backward_train_amp_program() + return program + + @LazyInitialized + def _empty_backward_program_for_eval(self): + return paddle.static.Program() + + @LazyInitialized + def _train_pure_fp16_forward_backward_program(self): + program = 
self._create_forward_backward_train_pure_fp16_program() + return program + + @LazyInitialized + def _train_program_id(self): + program_id = paddle.utils._hash_with_id(self._train_program, self) + core._set_cached_executor_build_strategy( + program_id, self._build_strategy + ) + return program_id + + @LazyInitialized + def _infer_program_id(self): + return paddle.utils._hash_with_id(self._infer_program, self) + + @LazyInitialized + def _train_amp_program_id(self): + program_id = paddle.utils._hash_with_id(self._train_amp_program, self) + core._set_cached_executor_build_strategy( + program_id, self._build_strategy + ) + return program_id + + @LazyInitialized + def _infer_amp_program_id(self): + return paddle.utils._hash_with_id(self._infer_amp_program, self) + + @LazyInitialized + def _train_pure_fp16_program_id(self): + program_id = paddle.utils._hash_with_id( + self._train_pure_fp16_program, self + ) + core._set_cached_executor_build_strategy( + program_id, self._build_strategy + ) + return program_id + + @LazyInitialized + def _infer_pure_fp16_program_id(self): + return paddle.utils._hash_with_id(self._infer_pure_fp16_program, self) + + def get_forward_end_op_idx(self, program): + return self._program_extra_info[ + paddle.utils._hash_with_id(program, self) + ]['forward_end_op_idx'] + + def get_program_extra(self, program): + if ( + paddle.utils._hash_with_id(program, self) + not in self._program_extra_info + ): + self._program_extra_info[ + paddle.utils._hash_with_id(program, self) + ] = {} + return self._program_extra_info[ + paddle.utils._hash_with_id(program, self) + ] + + @property + def program(self): + """ + Return current train or eval program. + """ + if self.training: + return self.train_program + else: + return self.infer_program + + @property + def program_id(self): + """ + Return current train or eval program hash id. + """ + if self.training: + if _in_amp_guard(): + return self._train_amp_program_id + elif _in_pure_fp16_guard(): + return self._train_pure_fp16_program_id + else: + return self._train_program_id + else: + if _in_amp_guard(): + return self._infer_amp_program_id + elif _in_pure_fp16_guard(): + return self._infer_pure_fp16_program_id + else: + return self._infer_program_id + + @property + def train_program(self): + if _in_amp_guard(): + return self._train_amp_program + elif _in_pure_fp16_guard(): + return self._train_pure_fp16_program + else: + return self._train_program + + @property + def infer_program(self): + if _in_amp_guard(): + return self._infer_amp_program + elif _in_pure_fp16_guard(): + return self._infer_pure_fp16_program + else: + return self._infer_program + + @property + def forward_program(self): + if self.training: + if _in_amp_guard(): + progs = self._train_amp_forward_backward_program + elif _in_pure_fp16_guard(): + progs = self._train_pure_fp16_forward_backward_program + else: + progs = self._train_forward_backward_program + return progs[0] + else: + return self.infer_program + + @property + def backward_program(self): + if self.training: + if _in_amp_guard(): + progs = self._train_amp_forward_backward_program + elif _in_pure_fp16_guard(): + progs = self._train_pure_fp16_forward_backward_program + else: + progs = self._train_forward_backward_program + return progs[1] + else: + """ + Can't just return paddle.static.Program(), because self.backward_program is a property, + whenever we call this method, a tmp Program() object is created and is gc immediatly + after executed the following line in PartialProgramLayer.__call__. 
+
+            >>> self.backward_program.desc.block(0),
+
+            When we access RunProgramAPI, it's possible to get an invalid backward_program address.
+            """
+            return self._empty_backward_program_for_eval
+
+    def _verify_program(self, main_program):
+        """
+        Verify that the program parameter is initialized, prune some unused params,
+        and remove redundant op callstacks.
+        """
+        # 1. Check all params from main program can be found in self._params
+        self._check_params_all_inited(main_program)
+        # 2. Prune the parameters not used anywhere in the program.
+        self._prune_unused_params(main_program)
+
+        return main_program
+
+    def prepare_gradient_aggregation(
+        self, start_idx, main_program, target_program
+    ):
+        """
+        Why do we need to add a gradient aggregation operation?
+        In some cases, if non-leaf nodes are used as outputs, gradient overwriting will occur, e.g.
+        def forward(self, in):
+            x = 2 * in  # <---- x is a non-leaf node in program.
+            y = x + 3
+            return x, y
+
+        loss = forward(in)[0].sum()
+        loss.backward()  # <----- x@grad will be overwritten by elementwise_add_grad Op
+        """
+
+        def _need_aggregation(var):
+            """
+            Return True if there exists an op that takes `var` as an input.
+            """
+            if not isinstance(var, framework.Variable) or var.type not in [
+                core.VarDesc.VarType.LOD_TENSOR,
+                core.VarDesc.VarType.SELECTED_ROWS,
+            ]:
+                return False
+            if var.dtype not in [paddle.float32, paddle.float64]:
+                return False
+            for op in main_program.block(0).ops:
+                for in_arg in op.input_arg_names:
+                    if in_arg == var.name:
+                        return True
+            return False
+
+        def _insert_aggregation_ops_for_var(target_program, var):
+            suffix = "@dy2static"
+            var_grad_name = var.grad_name
+            new_grad_name = var.name + suffix + "@GRAD"
+            finded_ops = list(
+                filter(
+                    lambda x: x[0] >= start_idx
+                    and any(
+                        out_arg == var_grad_name
+                        for out_arg in x[1].output_arg_names
+                    ),
+                    enumerate(target_program.block(0).ops),
+                )
+            )
+
+            # len(finded_ops) may equal zero when stop_gradient takes effect.
+            # len(finded_ops) may be > 1 because there may be a fill_constant op.
+            if len(finded_ops) == 0:
+                return None
+            # step1: create a new var named var.name@dy2static@GRAD
+            target_program.block(0).create_var(
+                name=new_grad_name,
+                type=var.type,
+                dtype=var.dtype,
+                shape=var.shape,
+            )
+            # step2: rename var.name@GRAD to var.name@dy2static@GRAD in the found ops
+            for idx, op in finded_ops:
+                op._rename_input(var_grad_name, new_grad_name)
+                op._rename_output(var_grad_name, new_grad_name)
+            # step3: insert a sum op to aggregate the gradient.
+ # var.name@GRAD = sum(var.name@dy2static@GRAD, var.name@GRAD) + target_program.block(0)._insert_op( + finded_ops[-1][0] + 1, + type='sum', + inputs={'X': [var_grad_name, new_grad_name]}, + outputs={"Out": var_grad_name}, + ) + return None + + to_processed_vars = list( + filter(_need_aggregation, self._outputs.tolist()) + ) + for _var in to_processed_vars: + _insert_aggregation_ops_for_var(target_program, _var) + + @switch_to_static_graph + def _append_backward_desc(self, main_program): + program = main_program + # if self._hooker: + # program = self._hooker.before_append_backward(program) + targets = list( + filter(lambda x: isinstance(x, OpResult), self._outputs.tolist()) + ) + inputs = list( + filter(lambda x: isinstance(x, OpResult), self._inputs.tolist()) + ) + forward_end_idx = len(program.block().ops) + if targets: + with backend_guard(self._backend): + check_type( + targets, + 'targets', + (OpResult, list, tuple), + 'paddle.static.gradients', + ) + with ir_static.program_guard(program, None): + grad_info_map = grad(inputs=inputs, outputs=targets) + + forward_outputs_grads = [] + not_stop_gradient_num = 0 + for out_op_result in self._outputs.tolist(): + if out_op_result.stop_gradient is True: + forward_outputs_grads.append(None) + continue + opres = ( + program.block() + .ops[forward_end_idx + not_stop_gradient_num] + .results()[0] + ) + forward_outputs_grads.append(opres) + not_stop_gradient_num += 1 + + # TODO: add later. + # if self._hooker: + # program, start_idx = self._hooker.after_append_backward( + # program, start_idx + # ) + + # TODO: add later + # self.prepare_gradient_aggregation( + # start_idx + 1, main_program, program + # ) + + mapping_op_result = ( + lambda x: x if isinstance(x, OpResult) else fake_op_result() + ) + hash_id = paddle.utils._hash_with_id(program, self) + extra_info = self._program_extra_info.get(hash_id, {}) + extra_info['forward_end_op_idx'] = forward_end_idx + extra_info['forward_inputs_grads'] = list( + map(mapping_op_result, grad_info_map) + ) + extra_info['forward_outputs_grads'] = list( + map(mapping_op_result, forward_outputs_grads) + ) + self._program_extra_info[hash_id] = extra_info + + return program + + def _prune_unused_params(self, program): + """ + Prune the parameters not used anywhere in the program. + The `@to_static` may only decorated a sub function which + contains some unused parameters created in `__init__`. + So prune these parameters to avoid unnecessary operations in + `run_program_op`. 
+ """ + required_params = [] + for param in self._params: + found_param = False + for block in program.blocks: + for op in block.ops: + if ( + param.name in op.input_arg_names + or param.name in op.output_arg_names + ): + required_params.append(param) + found_param = True + break + if found_param: + break + + self._params = required_params + + def _cast_fp16_if_pure_fp16(self, in_vars): + if _in_pure_fp16_guard(): + for i, var in enumerate(in_vars): + name = var.name + if ( + self.program.global_block().has_var(name) + and self.program.global_block().var(name).dtype + == paddle.float16 + ): + in_vars[i] = var.astype('float16') + in_vars[i].name = name + + def _prepare_attributes(self): + attrs = [ + 'forward_global_block', + self.forward_program.block(), + 'backward_global_block', + self.backward_program.block(), + 'is_test', + not self.training, + 'program_id', + self.program_id, + ] + + for key, val in self.get_program_extra(self.forward_program)[ + 'program_attr' + ].items(): + attrs.append(key) + attrs.append(val) + + if self._cuda_graph_capture_mode: + attrs.extend( + ( + 'cuda_graph_capture_mode', + self._cuda_graph_capture_mode, + 'cuda_graph_pool_id', + self._cuda_graph_pool_id, + ) + ) + return attrs + + @switch_to_static_graph + def _build_infer_program(self, infer_program, forward_end_op_index): + forward_skip_vars = self._parse_skip_gc_vars(infer_program) + builded_infer_program = add_build_strategy_for( + infer_program, + 0, + forward_end_op_index, + self._build_strategy, + forward_skip_vars, + ) + self._apply_inplace_pass(builded_infer_program, None) + return builded_infer_program + + @switch_to_static_graph + def _get_forward_backward_program_form( + self, whole_program, forward_end_op_index + ): + # NOTE(dev): We apply build_strategy for backward firstly to + # avoid skipping more gc variables. + forward_inputs_grads = self.get_program_extra(whole_program)[ + 'forward_inputs_grads' + ] + forward_inputs = self._inputs.tolist() + forward_outputs = self._outputs.tolist() + forward_outputs_grads = self.get_program_extra(whole_program)[ + 'forward_outputs_grads' + ] + backward_start_op_index = forward_end_op_index + len( + list(filter(lambda r: r.stop_gradient is False, self._outputs)) + ) + backward_end_op_index = len(whole_program.block().ops) + # For Backward process in CINN, all param@GRAD shoule be skipped for GC, because + # they will be shared in scope and used by optimizer. + + # TODO(xiongkun): consider cinn later. 
+ # backward_skip_vars = self._parse_skip_gc_vars( + # whole_program + # ) + self._grad_var_names.get('param', []) + + ( + forward_program, + backward_program, + ), program_attr = paddle.base.libpaddle.ir.program_split( + whole_program, + forward_inputs, + forward_outputs, + forward_inputs_grads, + forward_outputs_grads, + [0, forward_end_op_index], + [backward_start_op_index, backward_end_op_index], + ) + self.get_program_extra(forward_program)["program_attr"] = program_attr + return [forward_program, backward_program] + + def _apply_inplace_pass(self, forward_program, backward_program): + attr_types = { + "use_cuda": "bool", + "mem_opt_skip_vars": "list[str]", + "for_partial_block": "bool", + } + empty_startup_program = paddle.static.Program() + use_cuda = True if core.is_compiled_with_cuda() else False + # skip data var + forward_mem_opt_skip_vars = self._parse_skip_gc_vars( + forward_program, backward_program + ) + backward_mem_opt_skip_vars = self._parse_skip_gc_vars(forward_program) + if forward_program: + attrs = { + "use_cuda": use_cuda, + "mem_opt_skip_vars": forward_mem_opt_skip_vars, + "for_partial_block": True, + } + if not os.getenv("FLAGS_enable_new_ir_in_executor"): + _apply_pass( + forward_program, + empty_startup_program, + "buffer_shared_inplace_pass", + attrs, + attr_types, + ) + if backward_program: + attrs = { + "use_cuda": use_cuda, + "mem_opt_skip_vars": backward_mem_opt_skip_vars, + "for_partial_block": True, + } + if not os.getenv("FLAGS_enable_new_ir_in_executor"): + _apply_pass( + backward_program, + empty_startup_program, + "buffer_shared_inplace_pass", + attrs, + attr_types, + ) + + @LazyInitialized + def _inout_var_names(self): + """ + Returns Variable Names from self._inputs and self.outputs + """ + var_names = [] + for var in self._inputs: + if isinstance(var, paddle.base.framework.Variable): + var_names.append(var.desc.name()) + for var in self._outputs: + if isinstance(var, paddle.base.framework.Variable): + var_names.append(var.desc.name()) + return var_names + + def _parse_skip_gc_vars(self, program, backward_program=None): + """ + Parse variables that need to skip GC after execute it. + If specify backward_program, it will keep the variables used in backward. + """ + # skip data var, DO NOT ignore this deepcopy + skip_vars = deepcopy(self._inout_var_names) + for var_name, var in program.global_block().vars.items(): + if var.is_data: + skip_vars.append(var_name) + + if backward_program: + for var_name in core.parse_safe_eager_deletion_skip_vars( + backward_program.desc, True + ): + skip_vars.append(var_name) + return skip_vars + + def _prepare(self, inputs): + """ + Prepare inputs, outputs, attrs. + """ + assert isinstance(inputs, (tuple, list)) + # Flatten inputs with nested structure into single list. + flatten_inputs = paddle.utils.flatten(inputs) + # Convert variable into Tensor and feed in training data. + input_vars = [] + expected_place = framework._current_expected_place() + for i, value in enumerate(flatten_inputs): + if isinstance(value, np.ndarray): + var = None + var = core.eager.Tensor( + value=value, + persistable=False, + place=expected_place, + zero_copy=True, + ) + elif isinstance(value, core.eager.Tensor): + # NOTE(Aurelius84): If var is on CPUPlace, it will be transformed multi times + # into CUDAPlace when it's as input of multi Ops. so we move it in advance + # to avoid this problem. 
+ if value.stop_gradient and not value.place._equals( + expected_place + ): + var = value._copy_to(expected_place, False) + var.stop_gradient = True + else: + var = value + else: + continue + input_vars.append(var) + + # mapping from name(string) -> Tensor + out_tensor_map = {} + + def create_out(var_id): + var = self._outputs[var_id] + assert isinstance(var, OpResult) + + if id(var) in out_tensor_map: + return out_tensor_map[id(var)] + + if var.is_dense_tensor_type(): + tensor_type = paddle.dtype(7) # LOD TENSOR + else: + tensor_type = paddle.dtype(8) # SELECT ROW TENSOR + + # TODO(xiongkun): more elegent way to do it. + ir_dtype_2_tensor_dtype = { + 10: paddle.dtype(5), + } + out = core.eager.Tensor( + ir_dtype_2_tensor_dtype[int(var.dtype)], + var.shape, + "", + tensor_type, + False, + ) + out.stop_gradient = var.stop_gradient + out_tensor_map[id(var)] = out + return out + + # Create Tensor to receive output data. + out_vars = list(map(create_out, self._outputs.var_ids)) + return input_vars, out_vars + + def _create_scope_vec(self, program_id=None, use_scope_cache=False): + # Hold forward variables + tmp_scope_vec = None + inner_scope = self._get_scope( + program_id=program_id, use_scope_cache=use_scope_cache + ) + tmp_scope_vec = [inner_scope] + return tmp_scope_vec + + def _create_cuda_graph_vec(self): + var = core.eager.Tensor( + core.VarDesc.VarType.FP32, + [], + "cuda_graph", + core.VarDesc.VarType.RAW, + True, + ) + var.stop_gradient = True + return var + + def _update_stop_gradient(self, out_vars): + # Update stop_gradient for all outputs + def set_stop_gradient(var_id, eager_tensor): + var = self._outputs[var_id] + assert isinstance(var, OpResult) + eager_tensor.stop_gradient = var.stop_gradient + return None + + for idx, var in zip(self._outputs.var_ids, out_vars): + set_stop_gradient(idx, var) + + def _restore_out(self, out_vars): + """ + Restores same nested outputs by only replacing the Variable with Tensor. + """ + + flatten_outputs = self._outputs.tolist() + for i, idx in enumerate(self._outputs.var_ids): + flatten_outputs[idx] = out_vars[i] + outs = self._outputs.restore(flatten_outputs) + if outs is not None and len(outs) == 1: + outs = outs[0] + + return outs + + @switch_to_static_graph + def _clone_for_test(self, main_program): + return main_program.clone(for_test=True) + + def _is_no_value(self, var): + if isinstance(var, core.eager.Tensor) and var.shape == [1]: + # NOTE: .numpy() will insert MemcpySync operation, it hits performance. + if var.numpy()[0] == RETURN_NO_VALUE_MAGIC_NUM: + return True + return False + + def _remove_no_value(self, out_vars): + """ + Removes invalid value for various-length return statement + """ + if isinstance(out_vars, core.eager.Tensor): + if self._is_no_value(out_vars): + return None + return out_vars + elif isinstance(out_vars, (tuple, list)): + if isinstance(out_vars, tuple): + res = tuple( + var for var in out_vars if not self._is_no_value(var) + ) + else: + # isinstance(out_vars, list) + res = [var for var in out_vars if not self._is_no_value(var)] + + has_removed = len(out_vars) > len(res) + # len(out_vars) > len(res) means we have removed var. This is + # preventing out_vars is empty or just one element at the beginning + if len(res) == 0 and has_removed: + return None + elif len(res) == 1 and has_removed: + return res[0] + return res + + return out_vars + + def _set_grad_type(self, params, train_program): + # NOTE: if user set sparse gradient mode, the param's gradient + # will be SelectedRows, not LoDTensor. 
But tracer will just
+        # set the param grad Tensor by the forward Tensor (LoDTensor).
+        # If we don't change the grad_var type here, RunProgramOp needs to
+        # transform SelectedRows to LoDTensor forcibly, which may not be
+        # the result the user wants.
+        for param in params:
+            grad_name = param.name + core.grad_var_suffix()
+            grad_var = train_program.desc.block(0).find_var(grad_name.encode())
+            # NOTE: it may be fine if the grad var desc cannot be found, e.g. in batch_norm.
+            if grad_var is None:
+                continue
+            param._set_grad_type(grad_var.type())
+
+    def _remove_op_call_stack(self, main_program):
+        """
+        Remove each op's Python call stack with redundant low-level error messages related to
+        transformations to avoid confusing users.
+        """
+        assert isinstance(main_program, framework.Program)
+        for block in main_program.blocks:
+            for op in block.ops:
+                if op.has_attr("op_callstack"):
+                    op._remove_attr("op_callstack")
+
+        return main_program
+
+    def _check_params_all_inited(self, main_program):
+        """
+        Check that all params from the main program are already initialized. Details:
+        1. all parameters in self._params should be of type `framework.EagerParamBase`, which are created in dygraph.
+        2. all parameters from the transformed program can be found in self._params,
+           because they share the same data with the EagerParamBase of the original dygraph layer.
+        """
+        if not isinstance(self._params, (list, tuple)):
+            raise TypeError(
+                "Type of self._params in PartialProgramLayer should be list or tuple, but received %s."
+                % type(self._params)
+            )
+
+        param_and_buffer_names_set = set()
+        for i, var in enumerate(self._params):
+            # self._params contains parameters and buffers with persistable=True.
+            if not isinstance(var, core.eager.Tensor):
+                raise TypeError(
+                    'Type of self._params[{}] in PartialProgramLayer should be Parameter or Variable, but received {}.'.format(
+                        i, type(var)
+                    )
+                )
+            param_and_buffer_names_set.add(var.name)
+
+    def _valid_vars(self, vars):
+        return vars if vars else None
+
+
+def partial_program_from(concrete_program, from_method=False):
+    inputs = concrete_program.inputs
+
+    # NOTE(SigureMo): Remove the first arg `self` from method args.
+    if inputs and from_method:
+        inputs = inputs[1:]
+
+    return PartialProgramLayer(
+        concrete_program.main_program,
+        inputs,
+        concrete_program.outputs,
+        concrete_program.parameters,
+        **concrete_program.kwargs,
+    )
+
+
+@switch_to_static_graph
+def add_build_strategy_for(
+    program, start_op_index, end_op_index, build_strategy=None, skip_vars=None
+):
+    paddle.base.libpaddle.ir.program_split(
+        program,
+    )
+    if start_op_index < end_op_index:
+        pass
+    else:
+        # can't just create a new program; we need to copy the vardesc.
+ builded_program = ir_static.Program() + return builded_program diff --git a/python/paddle/jit/dy2static/program_translator.py b/python/paddle/jit/dy2static/program_translator.py index 55fc6d55d0e28..303bfbb9a3f4e 100644 --- a/python/paddle/jit/dy2static/program_translator.py +++ b/python/paddle/jit/dy2static/program_translator.py @@ -20,6 +20,7 @@ import warnings import weakref +import paddle.ir.core as ir_static from paddle.base import core, framework from paddle.base.data_feeder import check_type from paddle.base.dygraph.base import ( @@ -30,6 +31,7 @@ from paddle.base.unique_name import UniqueNameGenerator from paddle.base.unique_name import guard as UniqueNameGuard from paddle.framework import in_dynamic_mode +from paddle.ir.core import _use_new_ir_api from paddle.nn.layer import layers from paddle.utils import flatten, gast @@ -46,7 +48,15 @@ create_and_update_origin_info_map, update_op_callstack_with_origin_info, ) -from .partial_program import PartialProgramLayerHook, partial_program_from + +if ir_static._use_new_ir_api(): + from .newir_partial_program import ( + PartialProgramLayerHook, + partial_program_from, + ) +else: + from .partial_program import PartialProgramLayerHook, partial_program_from + from .utils import ( ALREADY_D2S, NO_SHAPE_VAR_TYPE, @@ -1156,6 +1166,106 @@ def __init__( self.name_generator = name_generator self.kwargs = kwargs + @staticmethod + @switch_to_static_graph + def newir_from_func_spec( + func_spec, input_spec, input_kwargs_spec, class_instance, **kwargs + ): + """ + Builds the main_program with specialized inputs and returns outputs + of program as fetch_list. + + Args: + func_spec(FunctionSpec): A FunctionSpec instance for decorated function. + input_spec(list[InputSpec]): + """ + # verify the instance is initialized in imperative mode. + _verify_init_in_dynamic_mode(class_instance) + + # Transforms dygraph function into static function and caches it. + dygraph_function = func_spec.dygraph_function + static_func = convert_to_static(dygraph_function) + # apply pre\post hook for outermost layer + hook_helper = HookHelper( + dygraph_function, class_instance, kwargs.get("with_hook", False) + ) + + main_program, startup_program = ir_static.Program(), ir_static.Program() + # Note: The random seed should be synchronized into cached program + # if set in `fluid.dygraph_guard` because some ops rely on it, such as + # `fluid.layers.dropout`. + + # TODO: new ir has no random seed. + # {{{ + # main_program.random_seed = static.default_main_program().random_seed + # startup_program.random_seed = ( + # framework.default_startup_program().random_seed + # ) }}} + with ir_static.program_guard(main_program, startup_program): + with _to_static_mode_guard_(is_to_static=True): + # 1. Adds `paddle.static.data` layers for input if needed + static_inputs = func_spec.newir_to_static_inputs_with_spec( + input_spec, main_program + ) + _kwargs = func_spec.newir_to_static_inputs_with_spec( + input_kwargs_spec, main_program + ) + if class_instance: + static_inputs = tuple( + [class_instance] + list(static_inputs) + ) + + # 2. Builds program only once and returns the output Variables. 
+ with param_guard( + get_parameters(class_instance, False) + ), param_guard(get_buffers(class_instance, False)): + try: + # only for jit.save, do nothing while train and eval process + inputs = hook_helper.apply_pre_hooks(static_inputs) + if _kwargs: + outputs = static_func(*inputs, **_kwargs) + else: + outputs = static_func(*inputs) + outputs = hook_helper.apply_post_hooks(inputs, outputs) + except BaseException as e: + # NOTE: If e is raised in compile time, e should be attached to ERROR_DATA here. + error.attach_error_data(e) + error_data = getattr(e, error.ERROR_DATA, None) + if error_data: + error_data.raise_new_exception() + raise + + # 3. Gets all ParamBases and buffered VarBases in the function + all_parameters_and_buffers = ( + ProgramTranslator.get_instance()._params_recorder.pop( + main_program + ) + ) + + if outputs is not None: + need_wrap_into_list = ( + not isinstance(outputs, (tuple, list)) + or len(outputs) == 1 + ) + if need_wrap_into_list: + outputs = [outputs] + + # TODO(@xiongkun): support op call stack in new ir? + # main_program = update_op_callstack_with_origin_info(main_program) + + new_name_generator = UniqueNameGenerator() + return ConcreteProgram( + inputs=static_inputs, + outputs=outputs, + parameters=all_parameters_and_buffers, + name_generator=new_name_generator, + function=dygraph_function, + main_program=main_program, + startup_program=startup_program, + **kwargs, + ) + + # TODO(@xiongkun): remove after new ir is switch @staticmethod @switch_to_static_graph def from_func_spec( @@ -1393,13 +1503,22 @@ def _build_once(self, cache_key): # NOTE(xiongkun): Need a global FLAGS to enable/disable fallback enable_fallback = enable_prim try: - concrete_program = ConcreteProgram.from_func_spec( - func_spec=cache_key.function_spec, - input_spec=cache_key.input_args_with_spec, - input_kwargs_spec=cache_key.input_kwargs_with_spec, - class_instance=cache_key.class_instance, - **cache_key.kwargs, - ) + if _use_new_ir_api(): + concrete_program = ConcreteProgram.newir_from_func_spec( + func_spec=cache_key.function_spec, + input_spec=cache_key.input_args_with_spec, + input_kwargs_spec=cache_key.input_kwargs_with_spec, + class_instance=cache_key.class_instance, + **cache_key.kwargs, + ) + else: + concrete_program = ConcreteProgram.from_func_spec( + func_spec=cache_key.function_spec, + input_spec=cache_key.input_args_with_spec, + input_kwargs_spec=cache_key.input_kwargs_with_spec, + class_instance=cache_key.class_instance, + **cache_key.kwargs, + ) except Exception as e: if enable_fallback: warnings.warn( @@ -1429,9 +1548,14 @@ def _build_once(self, cache_key): ) ) - partial_program = partial_program_from( - concrete_program, cache_key.class_instance is not None - ) + if ir_static._use_new_ir_api(): + partial_program = partial_program_from( + concrete_program, cache_key.class_instance is not None + ) + else: # TODO(new_ir): remove later. 
+            partial_program = partial_program_from(
+                concrete_program, cache_key.class_instance is not None
+            )
         with backend_guard(backend):
             if core._is_fwd_prim_enabled():
                 partial_program.set_hooker(
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index dfbdf6d04a62a..d0be94993d9a6 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -688,6 +688,8 @@ def add(x, y, name=None):
     if in_dynamic_mode():
         return _C_ops.add(x, y)
     else:
+        if paddle.ir.core._use_new_ir_api():
+            return paddle._ir_ops.add(x, y)
         return _elementwise_op(LayerHelper('elementwise_add', **locals()))
diff --git a/test/ir/new_ir/test_new_ir_to_static.py b/test/ir/new_ir/test_new_ir_to_static.py
new file mode 100644
index 0000000000000..4c4596984b878
--- /dev/null
+++ b/test/ir/new_ir/test_new_ir_to_static.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+
+import paddle
+
+os.environ['FLAGS_enable_new_ir_api'] = 'true'  # doesn't work, we should
+
+
+class TestDy2staticNewIR(unittest.TestCase):
+    def test_basic_network(self):
+        def func(x):
+            out = paddle.mean(x)
+            return out
+
+        static_func = paddle.jit.to_static(func)
+        x = paddle.randn((3, 3))
+        y = paddle.randn((3, 3))
+        x.stop_gradient = False
+        y.stop_gradient = False
+        ans = func(x)
+        out = static_func(x)
+
+        np.testing.assert_allclose(
+            out.numpy(), ans.numpy(), rtol=1e-05, atol=1e-8
+        )
+
+    def test_basic_network_backward(self):
+        def func(x):
+            out = paddle.mean(x)
+            return out
+
+        # ==== dygraph computation ====
+        static_func = paddle.jit.to_static(func)
+        x = paddle.randn((3, 3))
+        y = paddle.randn((3, 3))
+        x.stop_gradient = False
+        y.stop_gradient = False
+        loss = func(x) * 2
+        loss.backward()
+        x_grad_ans = x.grad.numpy()
+        x.clear_gradient()
+
+        # ==== to static computation ====
+        out = static_func(x)
+        out = out * 2
+        out.backward()
+        st_grad = x.grad
+
+        np.testing.assert_allclose(
+            x_grad_ans, st_grad.numpy(), rtol=1e-05, atol=1e-8
+        )
+
+
+class TestDy2staticNewIR3(unittest.TestCase):
+    def test_complex_layer(self):
+        def output_pure_func(x, y):
+            outx = paddle.mean(x)
+            outy = paddle.mean(y)
+            outy.stop_gradient = True
+            return paddle.add(outx, outy), outy
+
+        def run_function(to_static=True):
+            import paddle
+
+            # Set the random seed
+            paddle.seed(2023)
+            # Generate random inputs
+            x = paddle.randn((10, 10))
+            y = paddle.randn((10, 10))
+            x.stop_gradient = False
+            y.stop_gradient = True
+            func = output_pure_func
+            if to_static:
+                func = paddle.jit.to_static(func)
+            y, y_mean = func(x, y)
+            loss = y.mean()
+            loss.backward()
+            return (y, x.grad)
+
+        for dy, st in zip(run_function(False), run_function(True)):
+            np.testing.assert_allclose(
+                dy.numpy(), st.numpy(), rtol=1e-05, atol=1e-8
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
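The test sets `FLAGS_enable_new_ir_api` via `os.environ` after `import paddle`, which its own comment flags as not working. A hedged sketch of one way to drive it instead, assuming the flag is consumed at interpreter startup and the script is run from the repository root:

# Hypothetical test driver: export the flag before the child process imports
# paddle, so the new-IR API path is enabled from startup.
import os
import subprocess
import sys

env = dict(os.environ, FLAGS_enable_new_ir_api="true")
subprocess.run(
    [sys.executable, "test/ir/new_ir/test_new_ir_to_static.py"],
    env=env,
    check=True,
)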