From 4c6544a953e756256bc0ee430980fd18d61078d5 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 7 Dec 2022 09:07:09 +0000 Subject: [PATCH 1/5] rewrite convert_to_mixed_precision --- .../fluid/framework/ir/float_to_half_pass.cc | 68 +- .../fluid/framework/ir/float_to_half_pass.h | 18 +- .../inference/analysis/ir_pass_manager.cc | 31 +- .../ir_passes/tensorrt_subgraph_pass.cc | 16 +- .../passes/convert_to_mixed_precision.cc | 805 ++---------------- .../passes/convert_to_mixed_precision.h | 49 +- 6 files changed, 183 insertions(+), 804 deletions(-) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/float_to_half_pass.cc index ec94728fb3c641..9f2098683a95a9 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/float_to_half_pass.cc @@ -66,6 +66,23 @@ bool GpuKernelSupportPrecision( return support; } +inline bool VarNodeHasDtype(Node* var_node) { + auto type = var_node->Var()->GetType(); + return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || + (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || + (type == VarType::VOCAB); +} + +inline bool IsFloatType(VarType::Type type) { + return (type == VarType::FP64) || (type == VarType::FP32); +} + +inline bool IsHalfType(VarType::Type type) { + return (type == VarType::FP16) || (type == VarType::BF16); +} + +}; // namespace + void DoInsertCastOp(Graph* graph, Node* var_node, Node* op_node, @@ -118,23 +135,19 @@ void DoInsertCastOp(Graph* graph, IR_NODE_UNLINK(var_node, op_node); } -inline bool VarNodeHasDtype(Node* var_node) { - auto type = var_node->Var()->GetType(); - return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || - (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || - (type == VarType::VOCAB); -} - -inline bool IsFloatType(VarType::Type type) { - return (type == VarType::FP64) || (type == VarType::FP32); -} - -inline bool IsHalfType(VarType::Type type) { - return (type == VarType::FP16) || (type == VarType::BF16); +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list) { + bool support = false; + if (black_list.count(op_type) == 0) { + if (backend == phi::Backend::GPU) { + support = GpuKernelSupportPrecision(op_type, precision); + } + } + return support; } -}; // namespace - // The set of ops that support fp16 calculation and are considered // numerically-dangerous, slower and whose effects may also be observed in // downstream ops. 
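For reference, a minimal sketch of how the two free helpers introduced in the hunk above (OpSupportPrecision and DoInsertCastOp) could be called from another graph pass. Everything here is illustrative and not part of the patch: the function name, the blacklist contents, and the variables graph / op_node / var_node / block_desc are assumed to come from the caller, and the container template arguments (e.g. std::unordered_set<std::string>) are my assumption of the full signatures.

// Hypothetical caller: check GPU fp16 support for an op, then wire in a cast.
#include <string>
#include <unordered_map>
#include <unordered_set>

#include "paddle/fluid/framework/ir/float_to_half_pass.h"

namespace paddle {
namespace framework {
namespace ir {

void MaybeCastInputToHalf(Graph* graph,
                          Node* op_node,
                          Node* var_node,
                          BlockDesc* block_desc) {
  // Ops listed here are forced to stay in fp32 (placeholder contents).
  const std::unordered_set<std::string> black_list{"softmax", "layer_norm"};

  // Per the hunk above, OpSupportPrecision returns true only when the backend
  // is GPU, the op is not blacklisted, and a GPU fp16 kernel is registered.
  if (!OpSupportPrecision(op_node->Op()->Type(),
                          phi::Backend::GPU,
                          phi::DataType::FLOAT16,
                          black_list)) {
    return;
  }

  // The producer of var_node still emits fp32, so insert cast(fp32 -> fp16)
  // between var_node and op_node; the cache map lets a variable feeding
  // several low-precision ops reuse a single cast node.
  int suffix = 0;
  std::unordered_map<Node*, Node*> cache;
  DoInsertCastOp(graph,
                 var_node,
                 op_node,
                 proto::VarType::FP32,
                 proto::VarType::FP16,
                 block_desc,
                 &suffix,
                 &cache);
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle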
@@ -172,10 +185,17 @@ void FloatToHalfPass::SetDefaultBlacklist() const { void FloatToHalfPass::Init(Graph* graph) const { keep_io_types_ = true; + if (Has("keep_io_types")) { + keep_io_types_ = Get("keep_io_types"); + } half_precision_ = static_cast(Get("mixed_precision_mode")); black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); + VLOG(4) << "black_list has "; + for (const auto& name : black_list_) { + VLOG(4) << " - " << name; + } auto graph_size = graph->SubGraphsSize(); VLOG(4) << "graph size: " << graph_size; @@ -235,18 +255,6 @@ void FloatToHalfPass::ApplyImpl(Graph* graph) const { VLOG(4) << "RestoreOpOriginType done"; } -bool FloatToHalfPass::OpSupportPrecision(const std::string& op_type, - phi::DataType precision, - phi::Backend backend) const { - bool support = false; - if (black_list_.count(op_type) == 0) { - if (backend == phi::Backend::GPU) { - support = GpuKernelSupportPrecision(op_type, precision); - } - } - return support; -} - void FloatToHalfPass::SetOpUniqueType() const { int suffix = 0; for (const auto& nodes : all_op_nodes_) { @@ -328,8 +336,10 @@ void FloatToHalfPass::GetOpPrecision() const { GetOpOriginalType(op_type) == "fetch") { support_half = !keep_io_types_; } else { - support_half = - OpSupportPrecision(GetOpOriginalType(op_type), half_precision_); + support_half = OpSupportPrecision(GetOpOriginalType(op_type), + phi::Backend::GPU, + half_precision_, + black_list_); } if (op_node->Op()->HasAttr("dtype")) { diff --git a/paddle/fluid/framework/ir/float_to_half_pass.h b/paddle/fluid/framework/ir/float_to_half_pass.h index a274dc9a53c61a..89351a363403b1 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.h +++ b/paddle/fluid/framework/ir/float_to_half_pass.h @@ -46,10 +46,6 @@ class FloatToHalfPass : public FusePassBase { void SetDefaultBlacklist() const; - bool OpSupportPrecision(const std::string& op_type, - phi::DataType precision, - phi::Backend backend = phi::Backend::GPU) const; - void SetOpUniqueType() const; void RestoreOpOriginType() const; @@ -93,6 +89,20 @@ class FloatToHalfPass : public FusePassBase { mutable std::unordered_set vars_convert_to_half_; }; +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list); + +void DoInsertCastOp(Graph* graph, + Node* var_node, + Node* op_node, + proto::VarType::Type from_type, + proto::VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* cache); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index cbcc48a7f68e85..2068b8abb8375b 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -44,8 +44,10 @@ IRPassManager::IRPassManager(Argument *argument) { void IRPassManager::CreatePasses(Argument *argument, const std::vector &passes) { + // For graph_viz_pass std::string pre_pass; int pass_num = 0; + for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); pass->Set("use_varseqlen", new bool(argument->tensorrt_use_varseqlen())); @@ -86,15 +88,6 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_tuned_dynamic_shape(); pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); - // mixed precision related - pass->Set("model_precision", new int(argument->model_precision())); - pass->Set( - 
"mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); - pass->Set("mixed_precision_mode", - new int(argument->mixed_precision_mode())); - if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); std::string dot_file_path; @@ -209,10 +202,17 @@ void IRPassManager::CreatePasses(Argument *argument, new std::vector(argument->tensorrt_disabled_ops())); pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla())); pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core())); + // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will // not run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + + // Mixed precision related. + pass->Set("model_precision", new int(argument->model_precision())); + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); } else if (pass_name == "dlnne_subgraph_pass") { auto precision_mode = argument->dlnne_precision_mode(); pass->Set("min_subgraph_size", @@ -235,8 +235,7 @@ void IRPassManager::CreatePasses(Argument *argument, new framework::ProgramDesc *(&argument->main_program())); } else if (pass_name == "memory_optimize_pass") { pass->Set("root_predictor_id", new int(argument->root_predictor_id())); - } - if (pass_name == "lite_subgraph_pass") { + } else if (pass_name == "lite_subgraph_pass") { bool lite_enable_int8 = argument->lite_precision_mode() == AnalysisConfig::Precision::kInt8; pass->Set("program", @@ -284,8 +283,7 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("nnadapter_model_cache_token", new std::vector( argument->nnadapter_model_cache_token())); - } - if (pass_name == "fc_fuse_pass") { + } else if (pass_name == "fc_fuse_pass") { pass->Set("use_gpu", new bool(argument->use_gpu())); bool fc_mkldnn_pass = 0; for (const std::string &pass_n : passes) { @@ -295,6 +293,13 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); + } else if (pass_name == "float_to_half_pass") { + pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); + pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); + pass->Set("mixed_precision_mode", + new int(argument->mixed_precision_mode())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 44249796ec4055..f67891feccc5ce 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -83,14 +83,14 @@ void OutputProcess(framework::ir::Graph *graph, backend, precision, blacklist)) { - AddCastOp(graph, - var_node, - next_op, - framework::proto::VarType::FP32, - to_type, - &suffix, - block_desc, - &var_to_cast_op_map); + InsertCastOp(graph, + var_node, + next_op, + framework::proto::VarType::FP32, + to_type, + block_desc, + &suffix, + &var_to_cast_op_map); var_node->Var()->SetDataType(framework::proto::VarType::FP32); } } diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index afc1d8a882ca6e..e3740ff4e96e56 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ 
b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,662 +14,71 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/float_to_half_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/node.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/inference/io.h" -#include "paddle/phi/common/bfloat16.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/common/place.h" +#include "paddle/phi/common/backend.h" namespace paddle { namespace inference { namespace analysis { -namespace { -using VarType = framework::proto::VarType; - -bool PhiKernelSupportPrecision( - const std::string& op_type, +ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, phi::Backend backend, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto kernels = phi::KernelFactory::Instance().kernels(); - if (kernels.find(op_type) == kernels.end()) { - return false; - } - phi::KernelKey kernel_key(backend, layout, data_type); - return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key); -} - -bool GpuKernelSupportPrecision( - const std::string& op_type, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool res = PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPU, data_type, layout); - res |= PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPUDNN, data_type, layout); - - if (!res) { - auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); - auto it = all_kernels.find(op_type); - if (it != all_kernels.end()) { - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_) && - kern_pair.first.data_type_ == VarType::FP16) { - res = true; - break; - } - } - } - } - return res; -} - -class ConvertToMixedPrecisionPass { - using BlockID = size_t; - - public: - explicit ConvertToMixedPrecisionPass( - const std::string& model_file, - const std::string& params_file, - const std::string& mixed_model_file, - const std::string& mixed_params_file, - phi::DataType mixed_precision, - phi::Backend backend, - bool keep_io_types, - const std::unordered_set& black_list) - : model_file_(model_file), - params_file_(params_file), - mixed_model_file_(mixed_model_file), - mixed_params_file_(mixed_params_file), - mixed_precision_(mixed_precision), - backend_(backend), - keep_io_types_(keep_io_types), - black_list_(black_list), - place_(paddle::CPUPlace()), - executor_(place_) { - VLOG(4) << "black_list has "; - for (auto& name : black_list_) { - VLOG(4) << " - " << name; - } - } - - void Run(); - - private: - void LoadAndPrepare(); - inline bool VarNodeHasDtype(framework::ir::Node* node); - void 
ConvertAllFp64ToFp32(framework::ir::Graph* graph); - void FixCastAttr(framework::ir::Graph* graph); - void SaveMixedModel(); - void ConvertTensorDtype(BlockID block_idx); - void ProcessInputNode(bool support_precision, - framework::ir::Node* in_node, - framework::ir::Node* op_node, - int* suffix, - framework::BlockDesc* block_desc, - VarType::Type to_type, - BlockID block_idx); - - void ProcessOutputNode(BlockID block_idx, - framework::ir::Node* var_node, - VarType::Type to_type); - inline bool IsFloatVarType(VarType::Type type); - - bool OutShouldNotConvert(framework::ir::Node* var_node); - // Just process special cases for weights conversion. - bool WeightsShouldNotConvert(framework::ir::Node* var_node); - - // Return Node* which first appers in block. - framework::ir::Node* GetRealVarNode(framework::ir::Node* node); - - // Fallback to fp32 dtype when encounter circle (Not a DAG graph). - void ProcessCircleCases(); - - private: - std::string model_file_; - std::string params_file_; - std::string mixed_model_file_; - std::string mixed_params_file_; - phi::DataType mixed_precision_; - phi::Backend backend_; - bool keep_io_types_; - std::unordered_set black_list_; - paddle::CPUPlace place_; - framework::Executor executor_; - framework::Scope scope_; - - std::unordered_map name2node_; - std::unordered_map cast_map_; - int suffix_{0}; - - std::set var_names_in_circles_; - - std::unique_ptr program_desc_{nullptr}; - std::unique_ptr main_graph_{nullptr}; - std::vector graphes_; -}; - -framework::ir::Node* ConvertToMixedPrecisionPass::GetRealVarNode( - framework::ir::Node* var_node) { - CHECK_EQ(var_node->IsVar(), true); - if (name2node_.count(var_node->Name())) return name2node_[var_node->Name()]; - return var_node; -} - -inline bool ConvertToMixedPrecisionPass::VarNodeHasDtype( - framework::ir::Node* var_node) { - CHECK_EQ(var_node->IsVar(), true); - auto type = var_node->Var()->GetType(); - return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || - (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || - (type == VarType::VOCAB); -} - -void ConvertToMixedPrecisionPass::ProcessInputNode( - bool support_precision, - framework::ir::Node* in_node, - framework::ir::Node* op_node, - int* suffix, - framework::BlockDesc* block_desc, - VarType::Type to_type, - BlockID block_idx) { - if (!in_node->IsVar()) return; - auto* real_node = GetRealVarNode(in_node); - if (!VarNodeHasDtype(real_node)) return; - auto* graph = graphes_[block_idx]; - auto* in_var = real_node->Var(); - auto in_var_type = in_var->GetDataType(); - auto prev_type = in_var_type; - - if (support_precision) { - if (in_var->Persistable() && in_var_type == VarType::FP32) { - if (WeightsShouldNotConvert(in_node)) return; - in_var->SetDataType(to_type); - in_var_type = to_type; - VLOG(3) << " in_node name " << in_var->Name() << " from " << prev_type - << " to " << to_type; - } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - &cast_map_); - VLOG(3) << " in_node name " << in_var->Name() << "(" << prev_type - << ") to " << cast_map_[in_node]->Name() << "(" << to_type << ")"; - } - } else { - if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - &cast_map_); - VLOG(3) << " in_node name " << in_var->Name() << "(" << prev_type - << ") to " << 
cast_map_[in_node]->Name() << "(" << to_type << ")"; - } - } -} - -void ConvertToMixedPrecisionPass::ProcessOutputNode( - BlockID block_idx, framework::ir::Node* var_node, VarType::Type to_type) { - if (!var_node->IsVar()) return; - auto* real_node = GetRealVarNode(var_node); - if (!VarNodeHasDtype(real_node)) return; - auto* out_var = real_node->Var(); - auto prev_type = out_var->GetDataType(); - if (out_var->GetDataType() == VarType::FP32) { - if (OutShouldNotConvert(var_node)) return; - out_var->SetDataType(to_type); - } - VLOG(3) << " out_node name " << var_node->Name() << " from dtype " - << prev_type << " to " << out_var->GetDataType(); -} - -// Just process special cases. -bool ConvertToMixedPrecisionPass::OutShouldNotConvert( - framework::ir::Node* var_node) { - auto op_node = var_node->inputs[0]; - auto* op_desc = op_node->Op(); - - // batch_norm's input and output (variance and mean) are the same. - if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Output("MeanOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("VarianceOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedMean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedVariance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } + bool keep_io_types, + const std::unordered_set& black_list) + : model_file_(model_file), + params_file_(params_file), + mixed_model_file_(mixed_model_file), + mixed_params_file_(mixed_params_file), + mixed_precision_(mixed_precision), + backend_(backend), + keep_io_types_(keep_io_types), + black_list_(black_list) { + if (mixed_precision_ != phi::DataType::FLOAT16 && + mixed_precision_ != phi::DataType::BFLOAT16) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported dtype %d, we now only " + "support fp16 and bf16.", + static_cast(mixed_precision_))); } - - return false; -} - -bool ConvertToMixedPrecisionPass::WeightsShouldNotConvert( - framework::ir::Node* var_node) { - auto op_nodes = var_node->outputs; - for (auto* op_node : op_nodes) { - auto* op_desc = op_node->Op(); - // batch_norm op's bias, mean, scale and variance just be float32, so we can - // not convert the dtype. 
- if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Input("Bias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Mean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Scale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Variance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } else if (op_desc->Type() == "fused_multi_transformer") { - auto vecs = op_desc->Input("LnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("LnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } + if (backend_ != phi::Backend::GPU) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported place %d, we now only " + "support gpu.", + static_cast(backend_))); } - - return false; } -inline bool ConvertToMixedPrecisionPass::IsFloatVarType(VarType::Type type) { - return (type == VarType::FP16) || (type == VarType::FP32) || - (type == VarType::BF16); -} +void ConvertToMixedPrecisionPass::LoadModel() { + framework::Executor exe{platform::CPUPlace{}}; -void ConvertToMixedPrecisionPass::LoadAndPrepare() { - program_desc_ = - inference::Load(&executor_, &scope_, model_file_, params_file_); + auto program_desc = inference::Load(&exe, &scope_, model_file_, params_file_); main_graph_ = std::unique_ptr( - new framework::ir::Graph(*program_desc_)); - - for (size_t i = 0; i < main_graph_->SubGraphsSize(); ++i) { - auto* graph = main_graph_->GetSubGraph(i); - graphes_.push_back(graph); - - for (auto* node : graph->Nodes()) { - if (!node->IsVar()) continue; - if (!name2node_.count(node->Name())) { - name2node_[node->Name()] = node; - } - } - } - - ProcessCircleCases(); -} - -// Find var names which in circles. -void ConvertToMixedPrecisionPass::ProcessCircleCases() { - std::vector vars_in_circles; - for (size_t idx = 0; idx < program_desc_->Size(); ++idx) { - for (auto* op : program_desc_->Block(idx).AllOps()) { - // TODO(inference): batch_norm has circle, but we need to fuse it in conv - // op. 
- if (op->Type() == "batch_norm") continue; - const auto& in_names = op->InputArgumentNames(); - const auto& out_names = op->OutputArgumentNames(); - std::set in_names_set(in_names.begin(), in_names.end()); - std::set out_names_set(out_names.begin(), out_names.end()); - std::set_intersection(in_names_set.begin(), - in_names_set.end(), - out_names_set.begin(), - out_names_set.end(), - std::back_inserter(vars_in_circles)); - } - } - - for (auto& name : vars_in_circles) { - var_names_in_circles_.insert(name); - } - for (auto& name : var_names_in_circles_) { - LOG(INFO) << name - << " in circles, so we will skip process those vars and ops."; - } -} - -inline void ProcessConstantOpAttr(framework::ir::Node* op_node, - VarType::Type from_type, - VarType::Type to_type) { - if (!op_node->IsOp()) return; - auto op_type = op_node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") return; - - if (op_type == "fill_constant") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "assign_value") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "eye") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "fill_any_like") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("dtype", static_cast(to_type)); - } else if (op_type == "cast") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("in_dtype", static_cast(to_type)); - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == - static_cast(from_type)) - op_node->Op()->SetAttr("out_dtype", static_cast(to_type)); - } -} - -void ConvertToMixedPrecisionPass::ConvertAllFp64ToFp32( - framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - ProcessConstantOpAttr(op_node, VarType::FP64, VarType::FP32); - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - auto* in_var = in_node->Var(); - if (!in_var->Persistable() && in_var->GetDataType() == VarType::FP64) { - in_var->SetDataType(VarType::FP32); - } - } - } + new framework::ir::Graph(*program_desc)); + main_graph_->SetNotOwned(framework::ir::kParamScopeAttr, &scope_); } void ConvertToMixedPrecisionPass::Run() { - LoadAndPrepare(); + LoadModel(); - for (size_t i = 0; i < graphes_.size(); ++i) { - auto* graph = graphes_[i]; - VLOG(2) << " -------- handle subgraph " << i << ", has " - << graph->Nodes().size() << " nodes --------"; + framework::ir::FloatToHalfPass pass; + pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); + pass.Set("mixed_black_list", + new std::unordered_set{black_list_}); + pass.Set("enable_gpu_half", new bool{true}); + pass.Set("keep_io_types", new bool{keep_io_types_}); - ConvertAllFp64ToFp32(graph); - ConvertTensorDtype(i); - FixCastAttr(graph); - - CHECK_EQ(framework::ir::VarDescIsConsistency(*graph), true); - } + main_graph_.reset(pass.Apply(main_graph_.release())); SaveMixedModel(); } -void ConvertToMixedPrecisionPass::ConvertTensorDtype(BlockID block_idx) { - auto* graph = graphes_[block_idx]; - 
VarType::Type to_type; - if (mixed_precision_ == phi::DataType::FLOAT16) { - to_type = VarType::FP16; - } else if (mixed_precision_ == phi::DataType::BFLOAT16) { - to_type = VarType::BF16; - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "mixed_precision currently not supported dtype %d, we now only " - "support fp16 and bf16.", - static_cast(mixed_precision_))); - } - - auto op_nodes = framework::ir::TopologySortOperations(*graph); - auto* block_desc = op_nodes[0]->Op()->Block(); - int num_low_precision = 0; - std::vector output_nodes; - - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - VLOG(3) << "-------------------- op_type " << op_type << ", phi_type " - << phi::TransToPhiKernelName(op_type); - // 1. set input dtype. - if (op_type == "feed") { - auto feed_var = op_node->outputs[0]->Var(); - if (!keep_io_types_ && feed_var->GetDataType() == VarType::FP32) { - feed_var->SetDataType(to_type); - } - } else if (op_type == "fetch") { - auto* fetch_var = op_node->inputs[0]; - output_nodes.push_back(fetch_var); - continue; - } else if (op_type == "cast") { - continue; - } - - // We can not add cast operator before ops who have sub_block, as in - // sub_block we may get a var which may be transformer by cast op. - else if (op_node->Op()->HasAttr("sub_block")) { // NOLINT - continue; - } - - // 2. if op support fp16/bf16 and not in blacklist. - // - cast weight to fp16/bf16. - // - add cast op if the input dtype is not fp16/bf16. - // - set output dtype. - else if (black_list_.count(op_type) == 0) { // NOLINT - bool support_precision = - OpSupportPrecision(op_type, backend_, mixed_precision_, black_list_); - - // If op's output in circle, we should not convert to fp16. - for (auto* out_node : op_node->outputs) { - if (var_names_in_circles_.count(out_node->Name())) { - support_precision = false; - VLOG(2) << " op's output " << out_node->Name() - << " is in circle, we can not support this case, just skip."; - break; - } - } - - // If the op has no input or output of float type, we will not choose the - // low precision kernel. 
- if (support_precision) { - bool has_float_in_out{false}; - for (auto* in_node : op_node->inputs) { - if (!in_node->IsVar()) continue; - if (in_node->Var()->GetType() != VarType::LOD_TENSOR) { - support_precision = false; - VLOG(2) << " op has tensor array input[" << in_node->Name() - << "], just skip."; - break; - } - auto* real_node = GetRealVarNode(in_node); - if (real_node->Var()->GetDataType() == VarType::FP16 || - real_node->Var()->GetDataType() == VarType::FP32 || - real_node->Var()->GetDataType() == VarType::FP64 || - real_node->Var()->GetDataType() == VarType::BF16) { - has_float_in_out = true; - break; - } - } - for (auto* out_node : op_node->outputs) { - if (!out_node->IsVar()) continue; - auto* real_node = GetRealVarNode(out_node); - if (real_node->Var()->GetDataType() == VarType::FP16 || - real_node->Var()->GetDataType() == VarType::FP32 || - real_node->Var()->GetDataType() == VarType::FP64 || - real_node->Var()->GetDataType() == VarType::BF16) { - has_float_in_out = true; - break; - } - } - - if (!has_float_in_out) { - support_precision = false; - VLOG(2) << " op doesn't has float input and output, just skip."; - } - } - - VLOG(2) << "op type: " << op_type - << " support low precision: " << support_precision; - - if (support_precision) { - ProcessConstantOpAttr(op_node, VarType::FP32, to_type); - VLOG(2) << " process input nodes:"; - ++num_low_precision; - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - ProcessInputNode( - true, in_node, op_node, &suffix_, block_desc, to_type, block_idx); - } - - VLOG(2) << " process output nodes:"; - auto outputs = op_node->outputs; - for (auto* out_node : outputs) { - ProcessOutputNode(block_idx, out_node, to_type); - } - } else { - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - ProcessInputNode(false, - in_node, - op_node, - &suffix_, - block_desc, - VarType::FP32, - block_idx); - } - } - } - - // 3. check op not support fp16/bf16 or in blacklist. - // - add cast op if the input dtype is not fp32. - else { // NOLINT - VLOG(3) << "not to run fp16 op_type: " << op_type << ", node input size " - << op_node->inputs.size(); - auto in_nodes = op_node->inputs; - for (auto* in_node : in_nodes) { - auto* in_var = in_node->Var(); - if (in_var->GetDataType() == to_type) { - AddCastOp(graph, - in_node, - op_node, - to_type, - VarType::FP32, - &suffix_, - block_desc, - &cast_map_); - VLOG(3) << "-- " << in_node->Name() << "(" << to_type << ") to " - << cast_map_[in_node]->Name() << "(" << VarType::FP32 << ")"; - } - } - } - } - - // 4. if output_op's dtype is not compatible to output dtype, then just - // insert cast. - for (auto* node : output_nodes) { - framework::ir::Node* fetch_op{nullptr}; - for (auto* op_node : node->outputs) { - if (op_node->IsOp() && op_node->Op()->Type() == "fetch") { - fetch_op = op_node; - } - } - CHECK_NOTNULL(fetch_op); - auto* var = node->Var(); - if (keep_io_types_ && var->GetDataType() == to_type) { - // fp16/bf16 -> fp32. - AddCastOp(graph, - node, - fetch_op, - to_type, - VarType::FP32, - &suffix_, - block_desc, - &cast_map_); - } else if (!keep_io_types_ && var->GetDataType() == VarType::FP32) { - // fp32 -> fp16/bf16 - AddCastOp(graph, - node, - fetch_op, - VarType::FP32, - to_type, - &suffix_, - block_desc, - &cast_map_); - } - } - - if (num_low_precision) - LOG(INFO) << "--- detected " << num_low_precision - << " low precision ops in " << block_idx << " subgraph"; -} - -// We modify op's input output precision, and we need to fix cast op in_dtype -// and out_dtype attribute. 
-// TODO(inference): we need a cast elimination pass. -void ConvertToMixedPrecisionPass::FixCastAttr(framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - if (op_type != "cast") continue; - auto input = op_node->inputs[0]; - auto output = op_node->outputs[0]; - op_node->Op()->SetAttr("in_dtype", - static_cast(input->Var()->GetDataType())); - op_node->Op()->SetAttr("out_dtype", - static_cast(output->Var()->GetDataType())); - } -} - void ConvertToMixedPrecisionPass::SaveMixedModel() { framework::ProgramDesc mixed_program_desc; framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); @@ -677,51 +86,6 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { auto parameters = scope_.LocalVarNames(); std::sort(parameters.begin(), parameters.end()); - std::unordered_set weights_should_be_fp32; - for (auto* node : main_graph_->Nodes()) { - if (!node->IsVar()) continue; - if (VarNodeHasDtype(node)) { - if (node->Var()->Persistable() && - node->Var()->GetDataType() == VarType::FP32) { - VLOG(2) << "weights keep to fp32: " << node->Name() << ", ptr " - << reinterpret_cast(node->Var()); - weights_should_be_fp32.insert(node->Name()); - } - } - } - -#define CONVERT_TENSOR_DTYPE(DTYPE, dtype) \ - mixed_tensor.set_type(DTYPE); \ - auto* mixed_data = mixed_tensor.mutable_data(platform::CPUPlace()); \ - for (int64_t i = 0; i < origin_tensor->numel(); i++) { \ - mixed_data[i] = static_cast(origin_data[i]); \ - } \ - origin_tensor->clear(); \ - paddle::framework::TensorCopySync( \ - mixed_tensor, platform::CPUPlace(), origin_tensor) - - for (const auto& param_name : parameters) { - if (weights_should_be_fp32.count(param_name)) continue; - auto* var = scope_.FindLocalVar(param_name); - if (var->IsType()) { - auto* origin_tensor = var->GetMutable(); - if (origin_tensor->dtype() != phi::DataType::FLOAT32) continue; - phi::DenseTensor mixed_tensor; - mixed_tensor.Resize(origin_tensor->dims()); - auto* origin_data = - origin_tensor->mutable_data(platform::CPUPlace()); - if (mixed_precision_ == phi::DataType::FLOAT16) { - CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::FLOAT16, - phi::dtype::float16); - } else if (mixed_precision_ == phi::DataType::BFLOAT16) { - CONVERT_TENSOR_DTYPE(paddle::experimental::DataType::BFLOAT16, - phi::dtype::bfloat16); - } - } - } - -#undef CONVERT_TENSOR_DTYPE - auto SerializeParams = [&]() -> std::string { std::ostringstream os; phi::CPUContext ctx; @@ -746,73 +110,32 @@ void ConvertToMixedPrecisionPass::SaveMixedModel() { mixed_program_desc.Proto()->SerializeAsString()); StrToBinary(mixed_params_file_, SerializeParams()); } -} // namespace - -void AddCastOp( - framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, - VarType::Type from_type, - VarType::Type to_type, - int* suffix, - framework::BlockDesc* block_desc, - std::unordered_map* map) { - auto update_cast_desc = [&](framework::OpDesc& desc, - const std::string& x_name, - const std::string& out_name, - const int in_dtype, - const int out_dtype) { - desc.SetType("cast"); - desc.SetInput("X", {x_name}); - desc.SetOutput("Out", {out_name}); - desc.SetAttr("in_dtype", in_dtype); - desc.SetAttr("out_dtype", out_dtype); - desc.SetAttr("use_mkldnn", false); - desc.SetAttr("with_quant_attr", false); - desc.Flush(); - }; - - if (map->count(node) == 0) { - // insert cast op before node. 
- std::string cast_input_name = node->Var()->Name(); - std::string cast_output_name = - node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++); - CHECK_NOTNULL(block_desc); - framework::OpDesc cast_op_desc(block_desc); - update_cast_desc(cast_op_desc, - cast_input_name, - cast_output_name, - static_cast(from_type), - static_cast(to_type)); - auto* cast_op_node = graph->CreateOpNode(&cast_op_desc); - auto* cast_output_vardesc = block_desc->Var(cast_output_name); - cast_output_vardesc->SetPersistable(false); - cast_output_vardesc->SetDataType(to_type); - cast_output_vardesc->SetShape(node->Var()->GetShape()); - auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc); - IR_NODE_LINK_TO(cast_op_node, cast_output_node); - (*map)[node] = cast_output_node; - } - next_op->Op()->Rename(node->Name(), map->at(node)->Name()); - IR_NODE_LINK_TO(node, map->at(node)->inputs[0]); - IR_NODE_UNLINK(node, next_op); - IR_NODE_LINK_TO(map->at(node), next_op); -} bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool support_precision = false; - if (blacklist.count(op_type) == 0) { - if (backend == phi::Backend::GPU) - support_precision = GpuKernelSupportPrecision(op_type, precision); - else - support_precision = - PhiKernelSupportPrecision(phi_op_type, backend, precision); - } - return support_precision; + const std::unordered_set& black_list) { + return framework::ir::OpSupportPrecision( + op_type, backend, precision, black_list); +} + +void InsertCastOp( + framework::ir::Graph* graph, + framework::ir::Node* var_node, + framework::ir::Node* op_node, + framework::proto::VarType::Type from_type, + framework::proto::VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* visited) { + framework::ir::DoInsertCastOp(graph, + var_node, + op_node, + from_type, + to_type, + block_desc, + suffix, + visited); } void ConvertToMixedPrecision( diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index 583512408c5869..c3ae63aeeca261 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -15,14 +15,13 @@ #pragma once #include -#include #include #include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -30,20 +29,52 @@ namespace paddle { namespace inference { namespace analysis { +class ConvertToMixedPrecisionPass { + public: + explicit ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, + phi::Backend backend, + bool keep_io_types, + const std::unordered_set& black_list); + + void Run(); + + private: + void LoadModel(); + void SaveMixedModel(); + + private: + std::string model_file_; + std::string params_file_; + std::string mixed_model_file_; + std::string mixed_params_file_; + phi::DataType 
mixed_precision_; + phi::Backend backend_; + bool keep_io_types_; + std::unordered_set black_list_; + + framework::Scope scope_; + std::unique_ptr main_graph_{nullptr}; +}; + bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist); + const std::unordered_set& black_list); -void AddCastOp( +void InsertCastOp( framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, + framework::ir::Node* var_node, + framework::ir::Node* op_node, framework::proto::VarType::Type from_type, framework::proto::VarType::Type to_type, - int* suffix, framework::BlockDesc* block_desc, - std::unordered_map* map); + int* suffix, + std::unordered_map* visited); void ConvertToMixedPrecision(const std::string& model_file, const std::string& params_file, From 7e062dd03c0b1dfff2c86fbc19f577ee0abe52bf Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Wed, 7 Dec 2022 18:04:31 +0000 Subject: [PATCH 2/5] fix keep_io_types --- paddle/fluid/framework/ir/float_to_half_pass.cc | 8 +++++++- .../analysis/passes/convert_to_mixed_precision.cc | 3 ++- .../analysis/passes/convert_to_mixed_precision.h | 1 - 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/float_to_half_pass.cc index 9f2098683a95a9..5a20b913ce2069 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/float_to_half_pass.cc @@ -565,7 +565,11 @@ bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, void FloatToHalfPass::SetVarPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type())) { + if (op_run_half_.count(op_node->Op()->Type()) == 0) { + continue; + } + + if (GetOpOriginalType(op_node->Op()->Type()) != "feed") { for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); @@ -582,7 +586,9 @@ void FloatToHalfPass::SetVarPrecision() const { vars_convert_to_half_.insert(in_var_name); } } + } + if (GetOpOriginalType(op_node->Op()->Type()) != "fetch") { for (auto* out_var_node : op_node->outputs) { CHECK_EQ(out_var_node->IsVar(), true); diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index e3740ff4e96e56..906c745762dfda 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/float_to_half_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/inference/io.h" @@ -74,7 +75,7 @@ void ConvertToMixedPrecisionPass::Run() { pass.Set("enable_gpu_half", new bool{true}); pass.Set("keep_io_types", new bool{keep_io_types_}); - main_graph_.reset(pass.Apply(main_graph_.release())); + pass.Apply(main_graph_.get()); SaveMixedModel(); } diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index c3ae63aeeca261..3a1e5fbb30a21d 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -18,7 +18,6 @@ #include #include "paddle/fluid/framework/block_desc.h" 
-#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" From a9ff7a88a2b3306c9c4d27fcf27c8767a7c8b8d4 Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Fri, 9 Dec 2022 12:04:43 +0000 Subject: [PATCH 3/5] update --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- ...f_pass.cc => auto_mixed_precision_pass.cc} | 260 +++++++++--------- ...alf_pass.h => auto_mixed_precision_pass.h} | 20 +- .../ir/conv2d_fusion_layout_transfer_pass.cc | 2 +- .../ir/conv_elementwise_add_act_fuse_pass.cc | 2 +- paddle/fluid/inference/analysis/argument.h | 2 +- .../inference/analysis/ir_pass_manager.cc | 7 +- .../inference/analysis/passes/CMakeLists.txt | 2 +- .../passes/convert_to_mixed_precision.cc | 6 +- paddle/fluid/inference/api/analysis_config.cc | 8 +- .../fluid/inference/api/analysis_predictor.cc | 8 +- .../inference/api/paddle_analysis_config.h | 2 +- .../inference/api/paddle_pass_builder.cc | 2 +- 13 files changed, 159 insertions(+), 164 deletions(-) rename paddle/fluid/framework/ir/{float_to_half_pass.cc => auto_mixed_precision_pass.cc} (75%) rename paddle/fluid/framework/ir/{float_to_half_pass.h => auto_mixed_precision_pass.h} (84%) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f65db53893038c..c3685ea1d3669b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -103,7 +103,7 @@ pass_library(delete_c_identity_op_pass inference) pass_library(preln_residual_bias_fuse_pass inference) pass_library(delete_fill_constant_op_pass inference) pass_library(constant_folding_pass inference) -pass_library(float_to_half_pass inference) +pass_library(auto_mixed_precision_pass inference) pass_library(conv2d_fusion_layout_transfer_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/float_to_half_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc similarity index 75% rename from paddle/fluid/framework/ir/float_to_half_pass.cc rename to paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index a4b35267b5fe1a..48b1de077bf200 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/float_to_half_pass.h" +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/operator.h" @@ -29,7 +29,7 @@ namespace ir { namespace { -using VarType = FloatToHalfPass::VarType; +using VarType = AutoMixedPrecisionPass::VarType; bool PhiKernelSupportPrecision( const std::string& op_type, @@ -148,6 +148,9 @@ bool OpSupportPrecision(const std::string& op_type, if (black_list.count(op_type) == 0) { if (backend == phi::Backend::GPU) { support = GpuKernelSupportPrecision(op_type, precision); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Now, only support backend of GPU.")); } } return support; @@ -156,7 +159,7 @@ bool OpSupportPrecision(const std::string& op_type, // The set of ops that support fp16 calculation and are considered // numerically-dangerous, slower and whose effects may also be observed in // downstream ops. 
-void FloatToHalfPass::SetDefaultBlacklist() const { +void AutoMixedPrecisionPass::SetDefaultBlacklist() const { black_list_.insert({ // numerically-dangerous "acos", @@ -188,13 +191,16 @@ void FloatToHalfPass::SetDefaultBlacklist() const { }); } -void FloatToHalfPass::Init(Graph* graph) const { - keep_io_types_ = true; - if (Has("keep_io_types")) { - keep_io_types_ = Get("keep_io_types"); +void AutoMixedPrecisionPass::Init(Graph* graph) const { + bool enable_gpu_mixed = Get("enable_gpu_mixed"); + if (enable_gpu_mixed) { + backend_ = phi::Backend::GPU; } - half_precision_ = - static_cast(Get("mixed_precision_mode")); + + skip_pass_ = !enable_gpu_mixed; + + low_precision_ = static_cast(Get("low_precision_mode")); + black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); VLOG(4) << "black_list has "; @@ -202,6 +208,11 @@ void FloatToHalfPass::Init(Graph* graph) const { VLOG(4) << " - " << name; } + keep_io_types_ = true; + if (Has("keep_io_types")) { + keep_io_types_ = Get("keep_io_types"); + } + auto graph_size = graph->SubGraphsSize(); VLOG(4) << "graph size: " << graph_size; subgraphes_.resize(graph_size); @@ -224,24 +235,27 @@ void FloatToHalfPass::Init(Graph* graph) const { } } -void FloatToHalfPass::ApplyImpl(Graph* graph) const { - auto enable_gpu_half = Get("enable_gpu_half"); - if (!enable_gpu_half) return; - - PADDLE_ENFORCE_NOT_NULL( - graph, - platform::errors::PreconditionNotMet( - "During the float to half pass, the graph should not be nullptr.")); - PADDLE_ENFORCE_EQ( - graph->IsMainGraph(), - true, - platform::errors::PreconditionNotMet( - "During the float to half pass, the graph should be main graph.")); +void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the graph " + "should not be nullptr.")); + PADDLE_ENFORCE_EQ(graph->IsMainGraph(), + true, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the graph " + "should be main graph.")); - FusePassBase::Init("float_to_half", graph); + FusePassBase::Init("auto_mixed_precision", graph); Init(graph); VLOG(4) << "Init done"; + + if (skip_pass_) { + VLOG(3) << "Skip apply auto_low_precision_pass."; + return; + } + SetOpUniqueType(); VLOG(4) << "SetOpUniqueType done"; GetOpPrecision(); @@ -260,7 +274,7 @@ void FloatToHalfPass::ApplyImpl(Graph* graph) const { VLOG(4) << "RestoreOpOriginType done"; } -void FloatToHalfPass::SetOpUniqueType() const { +void AutoMixedPrecisionPass::SetOpUniqueType() const { int suffix = 0; for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { @@ -277,7 +291,7 @@ void FloatToHalfPass::SetOpUniqueType() const { } } -void FloatToHalfPass::RestoreOpOriginType() const { +void AutoMixedPrecisionPass::RestoreOpOriginType() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = op_node->Op()->Type(); @@ -289,7 +303,7 @@ void FloatToHalfPass::RestoreOpOriginType() const { } } -inline std::string FloatToHalfPass::GetOpOriginalType( +inline std::string AutoMixedPrecisionPass::GetOpOriginalType( const std::string& op_type) const { if (op_original_type_.count(op_type)) { return op_original_type_.at(op_type); @@ -297,22 +311,21 @@ inline std::string FloatToHalfPass::GetOpOriginalType( return op_type; } -void FloatToHalfPass::ProcessOpWithDtypeAttr() const { +void AutoMixedPrecisionPass::ProcessOpWithDtypeAttr() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = 
op_node->Op()->Type(); - if (op_run_half_.count(op_type) == 0) continue; + if (op_run_low_precision_.count(op_type) == 0) continue; if (op_node->Op()->HasAttr("dtype")) { auto dtype = op_node->Op()->GetAttrIfExists("dtype"); if (IsFloatType(static_cast(dtype))) { op_node->Op()->SetAttr( "dtype", - static_cast( - framework::TransToProtoVarType(half_precision_))); + static_cast(framework::TransToProtoVarType(low_precision_))); op_node->Op()->Flush(); VLOG(4) << "process op with dtype attr: " << op_type << " ( " << dtype - << " --->" << static_cast(half_precision_) << " )"; + << " --->" << static_cast(low_precision_) << " )"; } } if (op_node->Op()->HasAttr("out_dtype")) { @@ -320,11 +333,10 @@ void FloatToHalfPass::ProcessOpWithDtypeAttr() const { if (IsFloatType(static_cast(out_dtype))) { op_node->Op()->SetAttr( "out_dtype", - static_cast( - framework::TransToProtoVarType(half_precision_))); + static_cast(framework::TransToProtoVarType(low_precision_))); op_node->Op()->Flush(); VLOG(4) << "process op with out_dtype attr: " << op_type << " ( " - << out_dtype << " --->" << static_cast(half_precision_) + << out_dtype << " --->" << static_cast(low_precision_) << " )"; } } @@ -332,39 +344,39 @@ void FloatToHalfPass::ProcessOpWithDtypeAttr() const { } } -void FloatToHalfPass::GetOpPrecision() const { +void AutoMixedPrecisionPass::GetOpPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { auto op_type = op_node->Op()->Type(); - bool support_half = true; + bool support_low_precision = true; if (GetOpOriginalType(op_type) == "feed" || GetOpOriginalType(op_type) == "fetch") { - support_half = !keep_io_types_; + support_low_precision = !keep_io_types_; } else { - support_half = OpSupportPrecision(GetOpOriginalType(op_type), - phi::Backend::GPU, - half_precision_, - black_list_); + support_low_precision = OpSupportPrecision( + GetOpOriginalType(op_type), backend_, low_precision_, black_list_); } if (op_node->Op()->HasAttr("dtype")) { auto dtype = op_node->Op()->GetAttrIfExists("dtype"); - support_half = - support_half && IsFloatType(static_cast(dtype)); + support_low_precision = support_low_precision && + IsFloatType(static_cast(dtype)); } else if (op_node->Op()->HasAttr("out_dtype")) { auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); - support_half = - support_half && IsFloatType(static_cast(out_dtype)); + support_low_precision = + support_low_precision && + IsFloatType(static_cast(out_dtype)); } else { // if op's input var and output var is not dense tensor, the op should - // not run half. + // not run at low precision. 
for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; if (real_in_var_node->Var()->Persistable()) continue; - support_half = support_half && (real_in_var_node->Var()->GetType() == - VarType::LOD_TENSOR); + support_low_precision = + support_low_precision && + (real_in_var_node->Var()->GetType() == VarType::LOD_TENSOR); } for (auto* out_var_node : op_node->outputs) { @@ -372,23 +384,25 @@ void FloatToHalfPass::GetOpPrecision() const { auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; if (real_out_var_node->Var()->Persistable()) continue; - support_half = support_half && (real_out_var_node->Var()->GetType() == - VarType::LOD_TENSOR); + support_low_precision = + support_low_precision && + (real_out_var_node->Var()->GetType() == VarType::LOD_TENSOR); } } - if (support_half) { - op_run_half_.insert(op_type); - VLOG(4) << "support precision: " << op_type << " run at half"; + if (support_low_precision) { + op_run_low_precision_.insert(op_type); + VLOG(4) << "support precision: " << op_type << " run at low precision"; } else { - VLOG(4) << "support precision: " << op_type << " not run at half"; + VLOG(4) << "support precision: " << op_type + << " not run at low precision"; } } } } -void FloatToHalfPass::UpdateOpPrecision() const { - std::unordered_set vars_should_not_half; +void AutoMixedPrecisionPass::UpdateOpPrecision() const { + std::unordered_set vars_should_not_low_precision; // var -> the var's all input op std::unordered_map> var_input_ops; @@ -411,30 +425,16 @@ void FloatToHalfPass::UpdateOpPrecision() const { << " is output of " << op_type; } - // the select_input op's input var should not convert to half. when - // op's output var is select_input op's input var, the op should not run - // half. + // the select_input op's input var should not convert to low precision. + // when op's output var is select_input op's input var, the op should + // not run at low precision. if (GetOpOriginalType(op_node->Op()->Type()) == "select_input") { for (auto* in_var_node : op_node->inputs) { CHECK_EQ(in_var_node->IsVar(), true); if (in_var_node->Var()->Persistable()) continue; if (!VarNodeHasDtype(in_var_node)) continue; - vars_should_not_half.insert(in_var_node->Var()->Name()); - } - } - - // when op_1 only support cpu kernel. if op_2's intput var is op_1's - // output var, then op_2 should not run half. 
- if (GetOpOriginalType(op_type) != "feed" && - !GpuKernelSupportPrecision(GetOpOriginalType(op_type), - phi::DataType::FLOAT32)) { - for (auto* out_var_node : op_node->outputs) { - CHECK_EQ(out_var_node->IsVar(), true); - if (out_var_node->Var()->Persistable()) continue; - if (!VarNodeHasDtype(out_var_node)) continue; - - vars_should_not_half.insert(out_var_node->Var()->Name()); + vars_should_not_low_precision.insert(in_var_node->Var()->Name()); } } } @@ -447,25 +447,7 @@ void FloatToHalfPass::UpdateOpPrecision() const { precision_updated = false; for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type()) == 0) continue; - - for (auto* in_var_node : op_node->inputs) { - CHECK_EQ(in_var_node->IsVar(), true); - if (!VarNodeHasDtype(in_var_node)) continue; - - auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; - if (real_in_var_node->Var()->Persistable()) continue; - - if (vars_should_not_half.count(real_in_var_node->Var()->Name())) { - op_run_half_.erase(op_node->Op()->Type()); - precision_updated = true; - VLOG(4) << op_node->Op()->Type() - << " should not support half precision."; - break; - } - } - - if (op_run_half_.count(op_node->Op()->Type()) == 0) continue; + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) continue; for (auto* out_var_node : op_node->outputs) { CHECK_EQ(out_var_node->IsVar(), true); @@ -474,24 +456,25 @@ void FloatToHalfPass::UpdateOpPrecision() const { auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; if (real_out_var_node->Var()->Persistable()) continue; - bool not_run_half = false; + bool not_run_low_precision = false; const auto& input_op_nodes = var_input_ops[real_out_var_node->Var()->Name()]; - if (vars_should_not_half.count(real_out_var_node->Var()->Name())) { - not_run_half = true; + if (vars_should_not_low_precision.count( + real_out_var_node->Var()->Name())) { + not_run_low_precision = true; } else { for (auto* node : input_op_nodes) { - if (op_run_half_.count(node->Op()->Type()) == 0) { - not_run_half = true; + if (op_run_low_precision_.count(node->Op()->Type()) == 0) { + not_run_low_precision = true; break; } } } - if (not_run_half) { - op_run_half_.erase(op_node->Op()->Type()); + if (not_run_low_precision) { + op_run_low_precision_.erase(op_node->Op()->Type()); precision_updated = true; VLOG(4) << op_node->Op()->Type() - << " should not support half precision."; + << " should not run at low precision."; break; } } @@ -501,8 +484,8 @@ void FloatToHalfPass::UpdateOpPrecision() const { } // special ops, its weights should not be low precision. -bool FloatToHalfPass::InputVarsNotConvert(Node* op_node, - const std::string& var_name) const { +bool AutoMixedPrecisionPass::InputVarsNotConvert( + Node* op_node, const std::string& var_name) const { auto* op_desc = op_node->Op(); if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { auto vecs = op_desc->Input("Bias"); @@ -542,8 +525,8 @@ bool FloatToHalfPass::InputVarsNotConvert(Node* op_node, return false; } -bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, - const std::string& var_name) const { +bool AutoMixedPrecisionPass::OutputVarsNotConvert( + Node* op_node, const std::string& var_name) const { auto* op_desc = op_node->Op(); // batch_norm's input and output (variance and mean) are the same. 
if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { @@ -567,10 +550,10 @@ bool FloatToHalfPass::OutputVarsNotConvert(Node* op_node, return false; } -void FloatToHalfPass::SetVarPrecision() const { +void AutoMixedPrecisionPass::SetVarPrecision() const { for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { - if (op_run_half_.count(op_node->Op()->Type()) == 0) { + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) { continue; } @@ -587,8 +570,8 @@ void FloatToHalfPass::SetVarPrecision() const { if (real_in_var_node->Var()->Persistable()) { real_in_var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); - vars_convert_to_half_.insert(in_var_name); + framework::TransToProtoVarType(low_precision_)); + vars_convert_to_low_precision_.insert(in_var_name); } } } @@ -605,9 +588,9 @@ void FloatToHalfPass::SetVarPrecision() const { if (OutputVarsNotConvert(op_node, out_var_name)) continue; real_out_var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); + framework::TransToProtoVarType(low_precision_)); if (real_out_var_node->Var()->Persistable()) { - vars_convert_to_half_.insert(out_var_name); + vars_convert_to_low_precision_.insert(out_var_name); } } } @@ -622,24 +605,24 @@ void FloatToHalfPass::SetVarPrecision() const { if (!VarNodeHasDtype(var_node)) continue; auto var_name = var_node->Var()->Name(); - if (vars_convert_to_half_.count(var_name)) { + if (vars_convert_to_low_precision_.count(var_name)) { var_node->Var()->SetDataType( - framework::TransToProtoVarType(half_precision_)); + framework::TransToProtoVarType(low_precision_)); } } } } -void FloatToHalfPass::ConvertWeightsData() const { +void AutoMixedPrecisionPass::ConvertWeightsData() const { auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, - platform::errors::PreconditionNotMet( - "During the float to half pass, the scope should not be null.")); + PADDLE_ENFORCE_NOT_NULL(scope, + platform::errors::PreconditionNotMet( + "During the auto_low_precision_pass, the scope " + "should not be null.")); auto var_names = scope->LocalVarNames(); for (const auto& var_name : var_names) { - if (vars_convert_to_half_.count(var_name)) { + if (vars_convert_to_low_precision_.count(var_name)) { VLOG(4) << var_name << "'s data type was convert to half"; auto* var = scope->FindLocalVar(var_name); @@ -647,25 +630,29 @@ void FloatToHalfPass::ConvertWeightsData() const { auto* origin_tensor = var->GetMutable(); - phi::DenseTensor half_tensor; - half_tensor.Resize(origin_tensor->dims()); - half_tensor.set_type(half_precision_); + phi::DenseTensor low_precision_tensor; + low_precision_tensor.Resize(origin_tensor->dims()); + low_precision_tensor.set_type(low_precision_); - if (half_precision_ == phi::DataType::FLOAT16) { - auto* half_data = - half_tensor.mutable_data(phi::CPUPlace{}); + if (low_precision_ == phi::DataType::FLOAT16) { + auto* low_precision_data = + low_precision_tensor.mutable_data( + phi::CPUPlace{}); for (int64_t i = 0; i < origin_tensor->numel(); i++) { if (origin_tensor->dtype() == phi::DataType::FLOAT64) { auto* origin_data = origin_tensor->data(); - half_data[i] = static_cast(origin_data[i]); + low_precision_data[i] = + static_cast(origin_data[i]); } else if (origin_tensor->dtype() == phi::DataType::FLOAT32) { auto* origin_data = origin_tensor->data(); - half_data[i] = static_cast(origin_data[i]); + low_precision_data[i] = + static_cast(origin_data[i]); } } - } else if (half_precision_ == phi::DataType::BFLOAT16) { + } else if (low_precision_ == 
phi::DataType::BFLOAT16) { auto* half_data = - half_tensor.mutable_data(phi::CPUPlace{}); + low_precision_tensor.mutable_data( + phi::CPUPlace{}); for (int64_t i = 0; i < origin_tensor->numel(); i++) { if (origin_tensor->dtype() == phi::DataType::FLOAT64) { auto* origin_data = origin_tensor->data(); @@ -678,12 +665,12 @@ void FloatToHalfPass::ConvertWeightsData() const { } origin_tensor->clear(); paddle::framework::TensorCopySync( - half_tensor, phi::CPUPlace{}, origin_tensor); + low_precision_tensor, phi::CPUPlace{}, origin_tensor); } } } -void FloatToHalfPass::InsertCastOp() const { +void AutoMixedPrecisionPass::InsertCastOp() const { int suffix = 0; std::unordered_map cache; @@ -697,7 +684,7 @@ void FloatToHalfPass::InsertCastOp() const { if (op_node->Op()->HasAttr("sub_block")) continue; VLOG(4) << "process op: " << op_type - << " run half: " << op_run_half_.count(op_type); + << " run low precision: " << op_run_low_precision_.count(op_type); auto inputs = op_node->inputs; for (auto* in_var_node : inputs) { @@ -712,17 +699,17 @@ void FloatToHalfPass::InsertCastOp() const { VLOG(4) << "process var: " << real_in_var_node->Var()->Name() << " with type " << in_var_type; - if (IsFloatType(in_var_type) && op_run_half_.count(op_type)) { + if (IsFloatType(in_var_type) && op_run_low_precision_.count(op_type)) { DoInsertCastOp(subgraphes_[i], in_var_node, op_node, in_var_type, - framework::TransToProtoVarType(half_precision_), + framework::TransToProtoVarType(low_precision_), block_desc, &suffix, &cache); } else if (IsHalfType(in_var_type) && - op_run_half_.count(op_type) == 0) { + op_run_low_precision_.count(op_type) == 0) { DoInsertCastOp(subgraphes_[i], in_var_node, op_node, @@ -754,4 +741,5 @@ void FloatToHalfPass::InsertCastOp() const { } // namespace framework } // namespace paddle -REGISTER_PASS(float_to_half_pass, paddle::framework::ir::FloatToHalfPass); +REGISTER_PASS(auto_low_precision_pass, + paddle::framework::ir::AutoMixedPrecisionPass); diff --git a/paddle/fluid/framework/ir/float_to_half_pass.h b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h similarity index 84% rename from paddle/fluid/framework/ir/float_to_half_pass.h rename to paddle/fluid/framework/ir/auto_mixed_precision_pass.h index c15755896c32b1..578d47282b76d4 100644 --- a/paddle/fluid/framework/ir/float_to_half_pass.h +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h @@ -27,13 +27,13 @@ namespace paddle { namespace framework { namespace ir { -class FloatToHalfPass : public FusePassBase { +class AutoMixedPrecisionPass : public FusePassBase { public: using VarType = framework::proto::VarType; public: - FloatToHalfPass() = default; - ~FloatToHalfPass() = default; + AutoMixedPrecisionPass() = default; + ~AutoMixedPrecisionPass() = default; protected: void ApplyImpl(Graph* graph) const override; @@ -66,9 +66,13 @@ class FloatToHalfPass : public FusePassBase { void ConvertWeightsData() const; private: - mutable bool keep_io_types_; + mutable bool skip_pass_{false}; + + mutable bool keep_io_types_{false}; // float16 or bfloat16 now - mutable phi::DataType half_precision_; + mutable phi::DataType low_precision_{phi::DataType::FLOAT16}; + + mutable phi::Backend backend_{phi::Backend::GPU}; mutable std::unordered_set black_list_; @@ -80,10 +84,10 @@ class FloatToHalfPass : public FusePassBase { mutable std::vector> all_op_nodes_; // op's unique type -> the op's origin type mutable std::unordered_map op_original_type_; - // op's unique type -> whether the op run at half precision - mutable std::unordered_set 
op_run_half_; + // op's unique type -> whether the op run at low precision + mutable std::unordered_set op_run_low_precision_; - mutable std::unordered_set vars_convert_to_half_; + mutable std::unordered_set vars_convert_to_low_precision_; }; bool OpSupportPrecision(const std::string& op_type, diff --git a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc index dbba001d521015..efed7dd6e637bc 100644 --- a/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc +++ b/paddle/fluid/framework/ir/conv2d_fusion_layout_transfer_pass.cc @@ -142,7 +142,7 @@ void Conv2dFusionLayoutTransferPass::ApplyImpl(ir::Graph *graph) const { bool is_fp16_precision = static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || - Get("enable_gpu_half"); + Get("enable_gpu_mixed"); bool cutlass_enable = false; #ifdef PADDLE_WITH_CUTLASS diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index 063eb90d90af17..2f527ff1e707bb 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -165,7 +165,7 @@ void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { bool is_fp16_precision = static_cast(Get("model_precision")) == phi::DataType::FLOAT16 || - Get("enable_gpu_half"); + Get("enable_gpu_mixed"); constexpr int CUTLASS_NHWC_ALIGNMENT = 8; if (is_fp16_precision) { #ifdef PADDLE_WITH_CUTLASS diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index a72c1fe7622136..fea343f69e7e5e 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -365,7 +365,7 @@ struct Argument { DECL_ARGUMENT_FIELD(mixed_black_list, MixedBlackList, std::unordered_set); - DECL_ARGUMENT_FIELD(enable_gpu_half, EnableGPUHalf, bool); + DECL_ARGUMENT_FIELD(enable_gpu_mixed, EnableGPUMixed, bool); DECL_ARGUMENT_FIELD(mixed_precision_mode, MixedPrecisionMode, int); // cinn compiler related diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index da393502ea1625..b4c24cf9b8bd77 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -296,13 +296,16 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); - } else if (pass_name == "float_to_half_pass") { + } else if (pass_name == "auto_mixed_precision_pass") { pass->Set( "mixed_black_list", new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_half", new bool(argument->enable_gpu_half())); + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); pass->Set("mixed_precision_mode", new int(argument->mixed_precision_mode())); + } else if (pass_name == "conv_elementwise_add_act_fuse_pass" || + pass_name == "conv2d_fusion_layout_transfer_pass") { + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index fa074f962eb3d4..96121601cb6fdb 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,7 +13,7 
@@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass ir_graph_build_pass) + DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 906c745762dfda..f1939fc8b328b8 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -15,7 +15,7 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/ir/float_to_half_pass.h" +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/inference/io.h" #include "paddle/phi/common/backend.h" @@ -68,11 +68,11 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); - framework::ir::FloatToHalfPass pass; + framework::ir::AutoMixedPrecisionPass pass; pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); pass.Set("mixed_black_list", new std::unordered_set{black_list_}); - pass.Set("enable_gpu_half", new bool{true}); + pass.Set("enable_gpu_mixed", new bool{true}); pass.Set("keep_io_types", new bool{keep_io_types_}); pass.Apply(main_graph_.get()); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index c7554de9df73b2..b4d39e687203e3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -99,7 +99,7 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, // default } else if (precision_mode == Precision::kHalf || precision_mode == Precision::kBf16) { - enable_gpu_half_ = true; + enable_gpu_mixed_ = true; } else { LOG(ERROR) << "The Paddle-GPU inference currently only supports " @@ -396,7 +396,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // Mixed precision related. CP_MEMBER(mixed_black_list_); - CP_MEMBER(enable_gpu_half_); + CP_MEMBER(enable_gpu_mixed_); CP_MEMBER(mixed_precision_mode_); CP_MEMBER(enable_memory_optim_); @@ -1022,7 +1022,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; - ss << enable_gpu_half_; + ss << enable_gpu_mixed_; ss << use_external_stream_; ss << exec_stream_; ss << use_fc_padding_; @@ -1239,7 +1239,7 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"use_gpu", use_gpu_ ? 
"true" : "false"}); if (use_gpu_) { os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)}); - os.InsertRow({"enable_gpu_half_", std::to_string(enable_gpu_half_)}); + os.InsertRow({"enable_gpu_mixed_", std::to_string(enable_gpu_mixed_)}); os.InsertRow({"memory_pool_init_size", std::to_string(memory_pool_init_size_mb_) + "MB"}); os.InsertRow( diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index af4d83f55a6ee2..48cef9e95d75c8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1268,10 +1268,10 @@ void AnalysisPredictor::PrepareArgument() { if (!config_.ir_optim()) { argument_.SetEnableIrOptim(false); - if (config_.enable_gpu_half_) { + if (config_.enable_gpu_mixed_) { argument_.SetEnableIrOptim(true); pass_builder->ClearPasses(); - pass_builder->AppendPass("float_to_half_pass"); + pass_builder->AppendPass("auto_mixed_precision_pass"); LOG(INFO) << "This model run in Paddle-GPU mixed precision mode with no ir " "optimization."; @@ -1282,7 +1282,7 @@ void AnalysisPredictor::PrepareArgument() { if (config_.ir_debug_) { pass_builder->TurnOnDebug(); } - if (config_.enable_gpu_half_) { + if (config_.enable_gpu_mixed_) { LOG(INFO) << "This model run in Paddle-GPU mixed precision mode."; } } @@ -1294,7 +1294,7 @@ void AnalysisPredictor::PrepareArgument() { // mixed precison. argument_.SetModelPrecision(static_cast(model_precision_)); argument_.SetMixedBlackList(config_.mixed_black_list_); - argument_.SetEnableGPUHalf(config_.enable_gpu_half_); + argument_.SetEnableGPUMixed(config_.enable_gpu_mixed_); argument_.SetMixedPrecisionMode(static_cast( paddle::ConvertPrecision(config_.mixed_precision_mode_))); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index b4c5a0d293574d..41eea1fb98c319 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -1049,7 +1049,7 @@ struct PD_INFER_DECL AnalysisConfig { bool use_gpu_{false}; int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. 
- bool enable_gpu_half_{false}; + bool enable_gpu_mixed_{false}; bool thread_local_stream_{false}; bool use_cudnn_{false}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f7ce5b39ed9015..0cb7191ce7d261 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -245,7 +245,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { #endif // "transpose_flatten_concat_fuse_pass", // "constant_folding_pass", // - "float_to_half_pass", // + "auto_mixed_precision_pass", // }); use_gpu_ = true; From a49aad56962997ed0bdf86c732689c1e89f1a9cd Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 12 Dec 2022 03:46:20 +0000 Subject: [PATCH 4/5] fix --- .../framework/ir/auto_mixed_precision_pass.cc | 12 ++++----- .../inference/analysis/ir_pass_manager.cc | 25 +++++++------------ 2 files changed, 15 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 48b1de077bf200..bc034301989b0a 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -199,7 +199,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { skip_pass_ = !enable_gpu_mixed; - low_precision_ = static_cast(Get("low_precision_mode")); + low_precision_ = static_cast(Get("mixed_precision_mode")); black_list_ = Get>("mixed_black_list"); SetDefaultBlacklist(); @@ -238,12 +238,12 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { PADDLE_ENFORCE_NOT_NULL(graph, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the graph " + "During the auto_mixed_precision_pass, the graph " "should not be nullptr.")); PADDLE_ENFORCE_EQ(graph->IsMainGraph(), true, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the graph " + "During the auto_mixed_precision_pass, the graph " "should be main graph.")); FusePassBase::Init("auto_mixed_precision", graph); @@ -252,7 +252,7 @@ void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { VLOG(4) << "Init done"; if (skip_pass_) { - VLOG(3) << "Skip apply auto_low_precision_pass."; + VLOG(3) << "Skip auto_mixed_precision_pass."; return; } @@ -617,7 +617,7 @@ void AutoMixedPrecisionPass::ConvertWeightsData() const { auto* scope = param_scope(); PADDLE_ENFORCE_NOT_NULL(scope, platform::errors::PreconditionNotMet( - "During the auto_low_precision_pass, the scope " + "During the auto_mixed_precision_pass, the scope " "should not be null.")); auto var_names = scope->LocalVarNames(); @@ -741,5 +741,5 @@ void AutoMixedPrecisionPass::InsertCastOp() const { } // namespace framework } // namespace paddle -REGISTER_PASS(auto_low_precision_pass, +REGISTER_PASS(auto_mixed_precision_pass, paddle::framework::ir::AutoMixedPrecisionPass); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b4c24cf9b8bd77..734c8a60fb86be 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -89,6 +89,15 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_tuned_dynamic_shape(); pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); + // Mixed precision related. 
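// These mixed-precision attributes are now registered for every pass created
// here rather than inside individual branches; auto_mixed_precision_pass reads
// them back via Get<int>("mixed_precision_mode"), Get<bool>("enable_gpu_mixed")
// and Get<std::unordered_set<std::string>>("mixed_black_list").
//
// End to end, the values originate from AnalysisConfig: with the renaming in
// this series, passing Precision::kHalf (or kBf16) to EnableUseGpu sets
// enable_gpu_mixed_, which enables auto_mixed_precision_pass. A hypothetical
// caller, assuming the three-argument EnableUseGpu overload implied by the
// analysis_config.cc hunk above, might look like this sketch:
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void BuildMixedPrecisionPredictor() {
  paddle::AnalysisConfig config;
  config.SetModel("./model_dir");  // hypothetical model directory
  config.EnableUseGpu(100 /*memory_pool_init_size_mb*/,
                      0 /*gpu_device_id*/,
                      paddle::AnalysisConfig::Precision::kHalf);
  auto predictor = paddle_infer::CreatePredictor(config);
  (void)predictor;
}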
+ pass->Set( + "mixed_black_list", + new std::unordered_set(argument->mixed_black_list())); + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); + pass->Set("mixed_precision_mode", + new int(argument->mixed_precision_mode())); + pass->Set("model_precision", new int(argument->model_precision())); + if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); std::string dot_file_path; @@ -208,12 +217,6 @@ void IRPassManager::CreatePasses(Argument *argument, // not run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); - - // Mixed precision related. - pass->Set("model_precision", new int(argument->model_precision())); - pass->Set( - "mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); } else if (pass_name == "dlnne_subgraph_pass") { auto precision_mode = argument->dlnne_precision_mode(); pass->Set("min_subgraph_size", @@ -296,16 +299,6 @@ void IRPassManager::CreatePasses(Argument *argument, } bool use_fc_padding = !fc_mkldnn_pass && argument->use_fc_padding(); pass->Set("use_fc_padding", new bool(use_fc_padding)); - } else if (pass_name == "auto_mixed_precision_pass") { - pass->Set( - "mixed_black_list", - new std::unordered_set(argument->mixed_black_list())); - pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); - pass->Set("mixed_precision_mode", - new int(argument->mixed_precision_mode())); - } else if (pass_name == "conv_elementwise_add_act_fuse_pass" || - pass_name == "conv2d_fusion_layout_transfer_pass") { - pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); } pre_pass = pass_name; From 18bd36e2970d8086e0d2b1b574c7935218f175ac Mon Sep 17 00:00:00 2001 From: yuanlehome Date: Mon, 12 Dec 2022 09:46:24 +0000 Subject: [PATCH 5/5] merge --- .gitignore | 4 +- .../distributed/collective/CMakeLists.txt | 2 +- .../fluid/distributed/collective/NCCLTools.cc | 2 +- .../fluid/distributed/collective/NCCLTools.h | 37 +-- .../collective/ProcessGroupNCCL.cc | 200 +++++++----- .../distributed/collective/ProcessGroupNCCL.h | 4 +- paddle/fluid/distributed/collective/check.cc | 290 ++++++++++++++++++ .../collective/{static_check.h => check.h} | 46 ++- .../distributed/collective/static_check.cc | 155 ---------- .../tests/task_tests/nan_inf_utils_test.cc | 46 ++- .../framework/details/nan_inf_utils_detail.cc | 270 ++++++---------- .../framework/details/nan_inf_utils_detail.cu | 239 ++++++++------- .../framework/details/nan_inf_utils_detail.h | 119 ++++++- .../interpreter/dependency_builder.cc | 14 +- .../interpreter/dependency_builder.h | 2 +- .../interpreter/stream_analyzer.cc | 29 +- .../interpreter/stream_analyzer.h | 3 +- .../framework/new_executor/interpretercore.cc | 21 +- .../new_executor/new_executor_defs.h | 4 +- .../ir_passes/tensorrt_subgraph_pass.cc | 53 +++- paddle/fluid/inference/api/analysis_config.cc | 5 - .../inference/api/paddle_pass_builder.cc | 1 + paddle/fluid/inference/tensorrt/engine.cc | 2 +- paddle/fluid/inference/tensorrt/engine.h | 6 +- paddle/fluid/inference/tensorrt/op_teller.cc | 14 +- paddle/fluid/operators/batch_norm_op.cu | 4 +- paddle/fluid/operators/batch_norm_op.h | 2 +- .../operators/fused/fused_bn_activation_op.cu | 6 +- .../fused/fused_bn_add_activation_op.cu | 6 +- .../fluid/operators/generator/CMakeLists.txt | 129 +++----- .../operators/generator/generate_static_op.py | 153 +++++++++ paddle/fluid/operators/log_loss_op.cc | 129 -------- .../operators/mkldnn/reshape_mkldnn_op.cc | 22 -- 
paddle/fluid/operators/norm_utils.h | 51 --- paddle/fluid/operators/put_along_axis_op.cc | 111 ------- paddle/fluid/operators/searchsorted_op.cc | 72 ----- paddle/fluid/operators/svd_op.cc | 126 -------- .../fluid/operators/sync_batch_norm_op_mlu.cc | 4 +- .../fluid/operators/sync_batch_norm_op_npu.cc | 4 +- paddle/fluid/operators/take_along_axis_op.cc | 106 ------- .../operators/tensorrt/tensorrt_engine_op.h | 39 ++- .../tensorrt/tensorrt_engine_op_test.cc | 5 +- paddle/fluid/platform/dynload/cusolver.h | 23 +- paddle/phi/api/yaml/backward.yaml | 41 +++ paddle/phi/api/yaml/legacy_backward.yaml | 42 --- paddle/phi/api/yaml/legacy_ops.yaml | 50 --- paddle/phi/api/yaml/op_compat.yaml | 38 +++ paddle/phi/api/yaml/ops.yaml | 58 +++- paddle/phi/api/yaml/static_ops.yaml | 7 + paddle/phi/backends/dynload/cusolver.h | 23 +- paddle/phi/core/tensor_utils.cc | 43 ++- .../kernels/funcs}/norm_utils.cu.h | 20 +- .../kernels/funcs/values_vectors_functor.h | 179 ++++++++++- .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 37 ++- paddle/phi/kernels/gpu/batch_norm_kernel.cu | 1 - paddle/phi/kernels/gpu/stack_kernel.cu | 164 +++++++--- paddle/phi/kernels/onednn/reshape_kernel.cc | 179 +++++++++++ .../kernels/onednn/transpose_grad_kernel.cc | 8 +- paddle/phi/kernels/onednn/transpose_kernel.cc | 3 +- paddle/phi/ops/compat/log_loss_sig.cc | 29 -- paddle/phi/ops/compat/put_along_axis_sig.cc | 38 --- paddle/phi/ops/compat/svd_sig.cc | 27 -- paddle/phi/ops/compat/take_along_axis_sig.cc | 37 --- paddle/scripts/paddle_build.sh | 27 +- .../sharding/group_sharded_utils.py | 6 +- python/paddle/fluid/dygraph/amp/auto_cast.py | 52 +++- python/paddle/fluid/dygraph/checkpoint.py | 3 +- .../fluid/dygraph/learning_rate_scheduler.py | 15 +- python/paddle/fluid/dygraph/nn.py | 182 ----------- python/paddle/fluid/dygraph/parallel.py | 10 +- python/paddle/fluid/framework.py | 9 +- python/paddle/fluid/optimizer.py | 5 +- .../fleet/dygraph_save_for_auto_infer.py | 3 +- ...parallel_dygraph_control_flow_different.py | 9 +- .../fleet/parallel_dygraph_transformer.py | 21 +- .../test_imperative_auto_mixed_precision.py | 2 +- ...perative_auto_mixed_precision_for_eager.py | 2 +- .../dygraph_to_static/bert_dygraph_model.py | 28 +- .../seq2seq_dygraph_model.py | 26 +- .../dygraph_to_static/simnet_dygraph_model.py | 11 +- .../simnet_dygraph_model_v2.py | 9 +- .../unittests/dygraph_to_static/test_lac.py | 10 +- .../dygraph_to_static/test_ptb_lm.py | 11 +- .../dygraph_to_static/test_ptb_lm_v2.py | 10 +- .../dygraph_to_static/test_sentiment.py | 31 +- .../dygraph_to_static/test_word2vec.py | 14 +- .../transformer_dygraph_model.py | 24 +- .../unittests/ir/inference/test_trt_int64.py | 239 +++++++++++++++ .../parallel_dygraph_sparse_embedding.py | 10 +- .../standalone_executor/CMakeLists.txt | 53 +--- .../test_standalone_cross_step_overlap.py | 82 +++++ .../unittests/test_imperative_auto_prune.py | 9 +- .../test_imperative_load_static_param.py | 6 +- ..._imperative_lod_tensor_to_selected_rows.py | 12 +- .../test_imperative_named_members.py | 2 +- .../test_imperative_ocr_attention_model.py | 6 +- .../unittests/test_imperative_ptb_rnn.py | 10 +- .../unittests/test_imperative_save_load.py | 14 +- .../unittests/test_imperative_save_load_v2.py | 16 +- .../test_imperative_selected_rows.py | 10 +- ..._imperative_selected_rows_to_lod_tensor.py | 11 +- ..._imperative_transformer_sorted_gradient.py | 21 +- .../fluid/tests/unittests/test_layers.py | 36 ++- .../fluid/tests/unittests/test_nan_inf.py | 103 ++++++- .../tests/unittests/test_rnn_decode_api.py | 4 
+- python/paddle/static/__init__.py | 4 +- python/paddle/static/nn/metric.py | 2 +- 107 files changed, 2573 insertions(+), 2181 deletions(-) create mode 100644 paddle/fluid/distributed/collective/check.cc rename paddle/fluid/distributed/collective/{static_check.h => check.h} (65%) delete mode 100644 paddle/fluid/distributed/collective/static_check.cc create mode 100644 paddle/fluid/operators/generator/generate_static_op.py delete mode 100644 paddle/fluid/operators/log_loss_op.cc delete mode 100644 paddle/fluid/operators/norm_utils.h delete mode 100644 paddle/fluid/operators/put_along_axis_op.cc delete mode 100644 paddle/fluid/operators/searchsorted_op.cc delete mode 100644 paddle/fluid/operators/svd_op.cc delete mode 100644 paddle/fluid/operators/take_along_axis_op.cc create mode 100644 paddle/phi/api/yaml/static_ops.yaml rename paddle/{fluid/operators => phi/kernels/funcs}/norm_utils.cu.h (98%) create mode 100644 paddle/phi/kernels/onednn/reshape_kernel.cc delete mode 100644 paddle/phi/ops/compat/log_loss_sig.cc delete mode 100644 paddle/phi/ops/compat/put_along_axis_sig.cc delete mode 100644 paddle/phi/ops/compat/svd_sig.cc delete mode 100644 paddle/phi/ops/compat/take_along_axis_sig.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py create mode 100644 python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py diff --git a/.gitignore b/.gitignore index f1d02f4dd25c69..890aca6fa96ad1 100644 --- a/.gitignore +++ b/.gitignore @@ -73,8 +73,8 @@ tools/nvcc_lazy # these files (directories) are generated before build system generation paddle/fluid/operators/generated_op.cc paddle/fluid/operators/generated_sparse_op.cc -paddle/phi/ops/compat/generated_sig.cc -paddle/phi/ops/compat/generated_sparse_sig.cc +paddle/fluid/operators/generated_static_op.cc +paddle/phi/ops/compat/generated_*.cc paddle/phi/api/yaml/parsed_apis/ paddle/fluid/operators/generator/parsed_ops/ paddle/fluid/pybind/tmp_eager_op_function_impl.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 83b42fd4320706..85efa52c3196a7 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -21,7 +21,7 @@ endif() if(WITH_NCCL OR WITH_RCCL) cc_library( processgroup_nccl - SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc static_check.cc + SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc DEPS processgroup processgroup_stream place diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc index a8c437bb12225d..47c0f547ee79ea 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.cc +++ b/paddle/fluid/distributed/collective/NCCLTools.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/distributed/collective/NCCLTools.h" -#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h index 37b1e0f114c3d4..103e56a99d50b1 100644 --- a/paddle/fluid/distributed/collective/NCCLTools.h +++ b/paddle/fluid/distributed/collective/NCCLTools.h @@ -21,42 +21,29 @@ #include #endif -#include - #include #include "paddle/fluid/distributed/collective/Types.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/variable.h" - -#if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) -#include "paddle/fluid/platform/cuda_device_guard.h" -#endif - -#include "paddle/fluid/platform/device_context.h" #ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/phi/backends/dynload/rccl.h" #else -#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/phi/backends/dynload/nccl.h" #endif -#include "paddle/fluid/platform/enforce.h" -#include "paddle/utils/variant.h" - namespace paddle { namespace distributed { -#define NCCL_CHECK(cmd) \ - do { \ - ncclResult_t r = cmd; \ - if (r != ncclSuccess) { \ - printf("Failed, NCCL error %s:%d '%s'\n", \ - __FILE__, \ - __LINE__, \ - platform::dynload::ncclGetErrorString(r)); \ - exit(EXIT_FAILURE); \ - } \ +#define NCCL_CHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", \ + __FILE__, \ + __LINE__, \ + phi::dynload::ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) ncclRedOp_t ToNCCLRedType(ReduceOp reduction); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index b5c44962dd3a52..13de2625a6eeea 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/distributed/collective/Common.h" #include "paddle/fluid/distributed/collective/NCCLTools.h" -#include "paddle/fluid/distributed/collective/static_check.h" +#include "paddle/fluid/distributed/collective/check.h" #include "paddle/fluid/distributed/collective/utils.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/platform/place.h" @@ -25,6 +25,8 @@ DECLARE_bool(nccl_blocking_wait); DECLARE_bool(use_stream_safe_cuda_allocator); +// set this flag to `true` and recompile to enable dynamic checks +constexpr bool FLAGS_enable_nccl_dynamic_check = false; constexpr int64_t kWaitBlockTImeout = 10; namespace paddle { @@ -89,12 +91,10 @@ ProcessGroupNCCL::ProcessGroupNCCL(const std::shared_ptr& store, : ProcessGroupStream(rank, size, gid), store_(store) {} void ProcessGroupNCCL::GroupStart() { - NCCL_CHECK(platform::dynload::ncclGroupStart()); + NCCL_CHECK(phi::dynload::ncclGroupStart()); } -void ProcessGroupNCCL::GroupEnd() { - NCCL_CHECK(platform::dynload::ncclGroupEnd()); -} +void ProcessGroupNCCL::GroupEnd() { NCCL_CHECK(phi::dynload::ncclGroupEnd()); } phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { @@ -146,7 +146,13 @@ std::shared_ptr ProcessGroupNCCL::AllGather( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclAllGather( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclAllGather( in_tensor_maybe_partial.data(), out_tensor->data(), in_tensor_maybe_partial.numel(), @@ -173,7 +179,13 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclAllReduce( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclAllReduce( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -219,9 +231,10 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CheckSizeOnEachRank(out_dim, out_size_each_rank, size_); CheckSizeOnEachRank(in_dim, 
in_size_each_rank, size_); - // NOTE: Since `all_to_all` needs other processes's participation, it cannot + // NOTE: Since `all_to_all` needs other processes' participation, it cannot // simply be covered by static checks. Factors are set to 0 here to skip the - // shape check. Its shape check will be done by dynamic checks in debug mode. + // shape check. Its shape check will be done by dynamic checks with + // FLAGS_enable_nccl_dynamic_check. CommStaticCheck::CheckShape(*out_tensor, in_tensor, /*dst_rank*/ rank_, @@ -231,6 +244,10 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( /*in_size_factor*/ 0); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape( + *out_tensor, in_tensor, in_size_each_rank, rank_, size_, comm); + } int64_t in_row_size = in_tensor.numel() / in_dim[0], out_row_size = out_tensor->numel() / out_dim[0]; int64_t in_offset = 0, in_numel = 0, out_offset = 0, out_numel = 0; @@ -240,7 +257,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( for (auto i = 0; i < size_; i++) { in_numel = in_size_each_rank[i] * in_row_size; input_partial = GetPartialTensor(in_tensor, in_offset, in_numel); - NCCL_CHECK(platform::dynload::ncclSend( + NCCL_CHECK(phi::dynload::ncclSend( input_partial.data(), in_numel, platform::ToNCCLDataType(input_partial.dtype()), @@ -251,7 +268,7 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( out_numel = out_size_each_rank[i] * out_row_size; output_partial = GetPartialTensor(*out_tensor, out_offset, out_numel); - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( output_partial.data(), out_numel, platform::ToNCCLDataType(output_partial.dtype()), @@ -304,7 +321,10 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { int root = opts.source_rank + opts.source_root; - NCCL_CHECK(platform::dynload::ncclBroadcast( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, root, rank_, comm); + } + NCCL_CHECK(phi::dynload::ncclBroadcast( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -332,7 +352,13 @@ std::shared_ptr ProcessGroupNCCL::Reduce( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclReduce( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ opts.root_rank, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclReduce( in_tensor.data(), out_tensor->data(), in_tensor.numel(), @@ -361,7 +387,13 @@ std::shared_ptr ProcessGroupNCCL::ReduceScatter( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclReduceScatter( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ 0, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclReduceScatter( in_tensor.data(), out_tensor->data(), out_tensor->numel(), @@ -389,6 +421,12 @@ std::shared_ptr ProcessGroupNCCL::Scatter( size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*out_tensor, + /*root_rank*/ opts.root_rank, + rank_, + comm); + } int64_t numel = in_tensor.numel() / size_; if (rank_ == opts.root_rank) { int64_t offset = 0; @@ -396,7 +434,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( GroupStart(); for (auto i = 0; i < size_; i++) { partial_tensor = GetPartialTensor(in_tensor, offset, numel); - 
NCCL_CHECK(platform::dynload::ncclSend( + NCCL_CHECK(phi::dynload::ncclSend( partial_tensor.data(), numel, platform::ToNCCLDataType(partial_tensor.dtype()), @@ -405,7 +443,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); offset += numel; } - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( out_tensor->data(), numel, platform::ToNCCLDataType(out_tensor->dtype()), @@ -414,7 +452,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); GroupEnd(); } else { - NCCL_CHECK(platform::dynload::ncclRecv( + NCCL_CHECK(phi::dynload::ncclRecv( out_tensor->data(), numel, platform::ToNCCLDataType(out_tensor->dtype()), @@ -443,16 +481,22 @@ std::shared_ptr ProcessGroupNCCL::Recv( tensor = &partial_tensor; } - CommStaticCheck::SingleTensor(*tensor, rank_, size_); + CommStaticCheck::CheckShape(*tensor, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclRecv( - tensor->data(), - tensor->numel(), - platform::ToNCCLDataType(tensor->dtype()), - src_rank, - comm, - stream)); + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(*tensor, + /*root_rank*/ src_rank, + rank_, + comm); + } + NCCL_CHECK( + phi::dynload::ncclRecv(tensor->data(), + tensor->numel(), + platform::ToNCCLDataType(tensor->dtype()), + src_rank, + comm, + stream)); }, *tensor, CommType::RECV, @@ -471,10 +515,16 @@ std::shared_ptr ProcessGroupNCCL::Send( const phi::DenseTensor& tensor_maybe_partial = numel > 0 ? GetPartialTensor(tensor, offset, numel) : tensor; - CommStaticCheck::SingleTensor(tensor_maybe_partial, rank_, size_); + CommStaticCheck::CheckShape(tensor_maybe_partial, rank_, size_); return RunFnInNCCLEnv( [&](ncclComm_t comm, gpuStream_t stream) { - NCCL_CHECK(platform::dynload::ncclSend( + if (FLAGS_enable_nccl_dynamic_check) { + CommDynamicCheck::CheckShape(tensor_maybe_partial, + /*root_rank*/ rank_, + rank_, + comm); + } + NCCL_CHECK(phi::dynload::ncclSend( tensor_maybe_partial.data(), tensor_maybe_partial.numel(), platform::ToNCCLDataType(tensor_maybe_partial.dtype()), @@ -520,7 +570,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, ncclUniqueId nccl_id; if (rank_ == 0) { - NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(phi::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -532,7 +582,7 @@ void ProcessGroupNCCL::CreateNCCLEnvCache(const Place& place, platform::DeviceContextPool::Instance().Get(place)); auto comm_ctx = std::make_unique(place); ncclComm_t nccl_comm; - NCCL_CHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(phi::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); comm_ctx->set_nccl_comm(nccl_comm); @@ -589,6 +639,10 @@ std::shared_ptr ProcessGroupNCCL::RunFnInNCCLEnv( task->UpdateWaitChain(*comm_ctx); } + if (FLAGS_enable_nccl_dynamic_check) { + task->SetBlockCPUInWait(); + task->Wait(); + } return task; } @@ -633,7 +687,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( ncclUniqueId nccl_id; if (rank_ == 0) { - NCCL_CHECK(platform::dynload::ncclGetUniqueId(&nccl_id)); + NCCL_CHECK(phi::dynload::ncclGetUniqueId(&nccl_id)); } BroadcastUniqueNCCLID(&nccl_id); @@ -654,7 +708,7 @@ void ProcessGroupNCCL::CreateNCCLManagerCache( dev_ctx[i].reset(new phi::GPUContext(places[i])); ncclComm_t nccl_comm; - NCCL_CHECK(platform::dynload::ncclCommInitRank( + NCCL_CHECK(phi::dynload::ncclCommInitRank( &nccl_comm, GetSize(), nccl_id, GetRank())); dev_ctx[i]->set_nccl_comm(nccl_comm); dev_ctx_raw[i] = dev_ctx[i].get(); @@ 
-791,7 +845,7 @@ std::shared_ptr ProcessGroupNCCL::AllReduce( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllReduce( + return phi::dynload::ncclAllReduce( input.data(), output.data(), input.numel(), @@ -821,7 +875,7 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( const gpuStream_t& stream) { const auto root = opts.source_rank * in_tensors.size() + opts.source_root; - return platform::dynload::ncclBroadcast( + return phi::dynload::ncclBroadcast( input.data(), output.data(), input.numel(), @@ -871,13 +925,12 @@ std::shared_ptr ProcessGroupNCCL::Send( ncclComm_t comm, const gpuStream_t& stream, int dst_rank) { - return platform::dynload::ncclSend( - input.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - dst_rank, - comm, - stream); + return phi::dynload::ncclSend(input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); }, dst_rank, CommType::SEND); @@ -894,13 +947,12 @@ std::shared_ptr ProcessGroupNCCL::Recv( ncclComm_t comm, const gpuStream_t& stream, int src_rank) { - return platform::dynload::ncclRecv( - output.data(), - output.numel(), - platform::ToNCCLDataType(output.dtype()), - src_rank, - comm, - stream); + return phi::dynload::ncclRecv(output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); }, src_rank, CommType::RECV); @@ -925,7 +977,7 @@ std::shared_ptr ProcessGroupNCCL::AllGather( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - return platform::dynload::ncclAllGather( + return phi::dynload::ncclAllGather( input.data(), output.data(), input.numel(), @@ -994,14 +1046,14 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( size_t offset = 0; GroupStart(); for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), i, comm, stream)); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclRecv( GetPointerByOffset(output.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), @@ -1030,15 +1082,15 @@ std::shared_ptr ProcessGroupNCCL::Reduce( phi::DenseTensor& output, ncclComm_t comm, const gpuStream_t& stream) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( - input.data(), - output.data(), - input.numel(), - platform::ToNCCLDataType(input.dtype()), - ToNCCLRedType(opts.reduce_op), - opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclReduce(input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); }, CommType::REDUCE); } @@ -1066,7 +1118,7 @@ std::shared_ptr ProcessGroupNCCL::Scatter( if (rank_ == opts.root_rank) { GroupStart(); for (auto i = 0; i < size_; i++) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::ncclSend( GetPointerByOffset(input.data(), offset, input.dtype()), input.numel() / size_, platform::ToNCCLDataType(input.dtype()), @@ -1075,22 +1127,22 @@ std::shared_ptr ProcessGroupNCCL::Scatter( stream)); offset += input.numel() / size_; } - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - 
opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclRecv(output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); GroupEnd(); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( - output.data(), - input.numel() / size_, - platform::ToNCCLDataType(input.dtype()), - opts.root_rank, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclRecv(output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); } }, CommType::SCATTER); diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index 2a184e182aae9d..3ce77297f56f18 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -33,9 +33,9 @@ #endif #ifdef PADDLE_WITH_RCCL -#include "paddle/fluid/platform/dynload/rccl.h" +#include "paddle/phi/backends/dynload/rccl.h" #elif PADDLE_WITH_NCCL -#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/phi/backends/dynload/nccl.h" #endif namespace paddle { diff --git a/paddle/fluid/distributed/collective/check.cc b/paddle/fluid/distributed/collective/check.cc new file mode 100644 index 00000000000000..9a2ca064024f4c --- /dev/null +++ b/paddle/fluid/distributed/collective/check.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
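// Overview of the new file below (descriptive sketch, not itself part of the
// implementation): CommDynamicCheck adds runtime verification on top of the
// static checks. Before a collective runs, the root rank publishes its tensor
// metadata (first the dtype, then the element count) as an int64 through
// ncclBroadcast, and every other rank compares what it receives with its own
// tensor, raising InvalidArgument on a mismatch. Per rank, roughly:
//
//   int64_t meta = local metadata (dtype or numel) as int64;
//   copy meta into a small device buffer;     // ncclBroadcast needs GPU memory
//   ncclBroadcast(buffer, ..., root, ...);    // root publishes its value
//   copy back; if (cur_rank != root) expect meta to equal the local value;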
+ +#include "paddle/fluid/distributed/collective/check.h" + +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/errors.h" + +#ifdef PADDLE_WITH_HIP +#define gpuMalloc hipMalloc +#define gpuMemcpy hipMemcpy +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuFree hipFree +#else +#define gpuMalloc cudaMalloc +#define gpuMemcpy cudaMemcpy +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuFree cudaFree +#endif + +namespace paddle { +namespace distributed { + +// static checks +void CommStaticCheck::CheckRank(int rank, int world_size) { + PADDLE_ENFORCE_GE(rank, + 0, + phi::errors::InvalidArgument( + "Rank should be greater than or equal to 0.")); + PADDLE_ENFORCE_LT( + rank, + world_size, + phi::errors::InvalidArgument("Rank is out of the process group.")); +} + +void CommStaticCheck::CheckPlace(const phi::DenseTensor& tensor) { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(tensor.place()), + true, + platform::errors::InvalidArgument("Tensor should be in GPU place.")); +} + +void CommStaticCheck::CheckPlace(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor) { + CheckPlace(out_tensor); + CheckPlace(in_tensor); + PADDLE_ENFORCE_EQ( + out_tensor.place(), + in_tensor.place(), + phi::errors::InvalidArgument( + "Input and output tensors should be on the same place.")); +} + +void CommStaticCheck::CheckDataType(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor) { + PADDLE_ENFORCE_EQ( + out_tensor.dtype(), + in_tensor.dtype(), + phi::errors::InvalidArgument( + "Input and output tensors should have the same data type.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor) { + PADDLE_ENFORCE_GT( + tensor.numel(), + 0, + phi::errors::InvalidArgument("Size of tensor should be greater than 0.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int out_size_factor, + int in_size_factor) { + CheckShape(out_tensor); + CheckShape(in_tensor); + int64_t out_size = out_tensor.numel(), in_size = in_tensor.numel(); + PADDLE_ENFORCE_EQ( + out_size * out_size_factor, + in_size * in_size_factor, + phi::errors::InvalidArgument( + "Input and output tensors should have matching sizes.")); +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size, + int out_size_factor, + int in_size_factor) { + CheckRank(dst_rank, world_size); + CheckRank(cur_rank, world_size); + + CheckPlace(out_tensor, in_tensor); + CheckDataType(out_tensor, in_tensor); + + if (dst_rank == cur_rank) { + CheckShape(out_tensor, in_tensor, out_size_factor, in_size_factor); + } else { + CheckShape(out_tensor); + CheckShape(in_tensor); + } +} + +void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor, + int rank, + int world_size) { + CheckPlace(tensor); + CheckRank(rank, world_size); +} + +void CommStaticCheck::SameShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ 1); +} + +void CommStaticCheck::ScatterLikeShape(const 
phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ world_size, + /*in_size_factor*/ 1); +} + +void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + int dst_rank, + int cur_rank, + int world_size) { + CheckShape(out_tensor, + in_tensor, + dst_rank, + cur_rank, + world_size, + /*out_size_factor*/ 1, + /*in_size_factor*/ world_size); +} + +// dynamic checks +void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, + int64_t dtype) { + PADDLE_ENFORCE_EQ( + static_cast(tensor.dtype()), + dtype, + phi::errors::InvalidArgument( + "Tensors in communication are expected to have the same data type.")); +} + +void CommDynamicCheck::CheckDataType(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm) { + constexpr int kSize = sizeof(int64_t); + int64_t dtype_host = static_cast(tensor.dtype()); + int64_t* dtype_device; + PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&dtype_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(dtype_device, &dtype_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclBroadcast(dtype_device, + dtype_device, + kSize, + ncclInt64, + root_rank, + comm, + kDefaultStream)); + + if (root_rank == cur_rank) { + VLOG(3) << "Dynamic check broadcast metadata, dtype: " << dtype_host; + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(&dtype_host, dtype_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, dtype: " << dtype_host; + CheckDataType(tensor, dtype_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(dtype_device)); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor, + int64_t shape) { + PADDLE_ENFORCE_EQ( + tensor.numel(), + shape, + phi::errors::InvalidArgument( + "Tensors in communication are expected to have matching sizes.")); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm) { + CheckDataType(tensor, root_rank, cur_rank, comm); + + constexpr int kSize = sizeof(int64_t); + int64_t shape_host = tensor.numel(); + int64_t* shape_device; + + PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&shape_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(shape_device, &shape_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclBroadcast(shape_device, + shape_device, + kSize, + ncclInt64, + root_rank, + comm, + kDefaultStream)); + + if (root_rank == cur_rank) { + VLOG(3) << "Dynamic check broadcast metadata, shape: " << shape_host; + } else { + PADDLE_ENFORCE_GPU_SUCCESS( + gpuMemcpy(&shape_host, shape_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, shape: " << shape_host; + CheckShape(tensor, shape_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(shape_device)); +} + +void CommDynamicCheck::CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& in_size_each_rank, + int cur_rank, + int world_size, + ncclComm_t comm) { + CheckDataType(out_tensor, /*root_rank*/ 0, cur_rank, comm); + CheckDataType(in_tensor, /*root_rank*/ 0, cur_rank, comm); + + constexpr int kSize = sizeof(int64_t); + int64_t in_row_size = in_tensor.numel() / in_tensor.dims()[0]; + + for (int rank = 0; rank < world_size; ++rank) { + int64_t in_shape_host = in_size_each_rank[rank] * in_row_size; + int64_t* in_shape_device; + 
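// Note on the surrounding loop: for each destination `rank`, every participant
// contributes the number of elements it is about to send to that rank, and the
// ncclReduce rooted at `rank` sums these counts; the value the root reads back
// is therefore the total number of elements its out_tensor must hold, which
// the final CheckShape call verifies.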
PADDLE_ENFORCE_GPU_SUCCESS(gpuMalloc(&in_shape_device, kSize)); + PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( + in_shape_device, &in_shape_host, kSize, gpuMemcpyHostToDevice)); + + NCCL_CHECK(phi::dynload::ncclReduce(in_shape_device, + in_shape_device, + kSize, + ncclInt64, + ncclSum, + rank, + comm, + kDefaultStream)); + if (rank == cur_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(gpuMemcpy( + &in_shape_host, in_shape_device, kSize, gpuMemcpyDeviceToHost)); + VLOG(3) << "Dynamic check recv metadata, shape: " << in_shape_host; + CheckShape(out_tensor, in_shape_host); + } + PADDLE_ENFORCE_GPU_SUCCESS(gpuFree(in_shape_device)); + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/static_check.h b/paddle/fluid/distributed/collective/check.h similarity index 65% rename from paddle/fluid/distributed/collective/static_check.h rename to paddle/fluid/distributed/collective/check.h index 5dcb17e505438c..be9bfb5f78f393 100644 --- a/paddle/fluid/distributed/collective/static_check.h +++ b/paddle/fluid/distributed/collective/check.h @@ -14,7 +14,18 @@ #pragma once -// forward declaration to reduce deps +#include +#include + +#include "paddle/phi/backends/gpu/forwards.h" + +#ifdef PADDLE_WITH_HIP +using gpuStream_t = hipStream_t; +#else +using gpuStream_t = cudaStream_t; +#endif + +// forward declarations namespace phi { class DenseTensor; } @@ -49,9 +60,9 @@ struct CommStaticCheck { int in_size_factor); // for p2p - static void SingleTensor(const phi::DenseTensor& tensor, - int rank, - int world_size); + static void CheckShape(const phi::DenseTensor& tensor, + int rank, + int world_size); // for collective static void SameShape(const phi::DenseTensor& out_tensor, @@ -73,5 +84,32 @@ struct CommStaticCheck { int world_size); }; +struct CommDynamicCheck { + static void CheckDataType(const phi::DenseTensor& tensor, int64_t dtype); + + static void CheckDataType(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm); + + static void CheckShape(const phi::DenseTensor& tensor, int64_t shape); + + static void CheckShape(const phi::DenseTensor& tensor, + int root_rank, + int cur_rank, + ncclComm_t comm); + + static void CheckShape(const phi::DenseTensor& out_tensor, + const phi::DenseTensor& in_tensor, + const std::vector& in_size_each_rank, + int cur_rank, + int world_size, + ncclComm_t comm); + + private: + // `0` represents default stream for both cuda & hip + static constexpr gpuStream_t kDefaultStream = 0; +}; + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/collective/static_check.cc b/paddle/fluid/distributed/collective/static_check.cc deleted file mode 100644 index 98336db90d1e29..00000000000000 --- a/paddle/fluid/distributed/collective/static_check.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/distributed/collective/static_check.h" - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/errors.h" - -namespace paddle { -namespace distributed { - -void CommStaticCheck::CheckRank(int rank, int world_size) { - PADDLE_ENFORCE_GE(rank, - 0, - phi::errors::InvalidArgument( - "Rank should be greater than or equal to 0.")); - PADDLE_ENFORCE_LT( - rank, - world_size, - phi::errors::InvalidArgument("Rank is out of the process group.")); -} - -void CommStaticCheck::CheckPlace(const phi::DenseTensor& tensor) { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(tensor.place()), - true, - platform::errors::InvalidArgument("Tensor should be in GPU place.")); -} - -void CommStaticCheck::CheckPlace(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor) { - CheckPlace(out_tensor); - CheckPlace(in_tensor); - PADDLE_ENFORCE_EQ( - out_tensor.place(), - in_tensor.place(), - phi::errors::InvalidArgument( - "Input and output tensors should be on the same place.")); -} - -void CommStaticCheck::CheckDataType(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor) { - PADDLE_ENFORCE_EQ( - out_tensor.dtype(), - in_tensor.dtype(), - phi::errors::InvalidArgument( - "Input and output tensors should have the same data type.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& tensor) { - PADDLE_ENFORCE_GT( - tensor.numel(), - 0, - phi::errors::InvalidArgument("Size of tensor should be greater than 0.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int out_size_factor, - int in_size_factor) { - CheckShape(out_tensor); - CheckShape(in_tensor); - int64_t out_size = out_tensor.numel(), in_size = in_tensor.numel(); - PADDLE_ENFORCE_EQ( - out_size * out_size_factor, - in_size * in_size_factor, - phi::errors::InvalidArgument( - "Input and output tensors should have matching sizes.")); -} - -void CommStaticCheck::CheckShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size, - int out_size_factor, - int in_size_factor) { - CheckRank(dst_rank, world_size); - CheckRank(cur_rank, world_size); - - CheckPlace(out_tensor, in_tensor); - CheckDataType(out_tensor, in_tensor); - - if (dst_rank == cur_rank) { - CheckShape(out_tensor, in_tensor, out_size_factor, in_size_factor); - } else { - CheckShape(out_tensor); - CheckShape(in_tensor); - } -} - -void CommStaticCheck::SingleTensor(const phi::DenseTensor& tensor, - int rank, - int world_size) { - CheckPlace(tensor); - CheckRank(rank, world_size); -} - -void CommStaticCheck::SameShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - /*out_size_factor*/ 1, - /*in_size_factor*/ 1); -} - -void CommStaticCheck::ScatterLikeShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - /*out_size_factor*/ world_size, - /*in_size_factor*/ 1); -} - -void CommStaticCheck::GatherLikeShape(const phi::DenseTensor& out_tensor, - const phi::DenseTensor& in_tensor, - int dst_rank, - int cur_rank, - int world_size) { - CheckShape(out_tensor, - in_tensor, - dst_rank, - cur_rank, - world_size, - 
/*out_size_factor*/ 1, - /*in_size_factor*/ world_size); -} - -} // namespace distributed -} // namespace paddle diff --git a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc index 73d213f71148f7..86f863bdffa6d5 100644 --- a/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc +++ b/paddle/fluid/eager/tests/task_tests/nan_inf_utils_test.cc @@ -30,32 +30,30 @@ PD_DECLARE_KERNEL(strings_empty, CPU, ALL_LAYOUT); namespace egr { -#define CHECK_NAN_INF(tensors) \ - { \ - bool caught_exception = false; \ - try { \ - CheckTensorHasNanOrInf("nan_inf_test", tensors); \ - } catch (paddle::platform::EnforceNotMet & error) { \ - caught_exception = true; \ - std::string ex_msg = error.what(); \ - EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ - std::string::npos); \ - } \ - EXPECT_TRUE(caught_exception); \ +#define CHECK_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are NAN or INF") != std::string::npos); \ + } \ + EXPECT_TRUE(caught_exception); \ } -#define CHECK_NO_NAN_INF(tensors) \ - { \ - bool caught_exception = false; \ - try { \ - CheckTensorHasNanOrInf("nan_inf_test", tensors); \ - } catch (paddle::platform::EnforceNotMet & error) { \ - caught_exception = true; \ - std::string ex_msg = error.what(); \ - EXPECT_TRUE(ex_msg.find("There are `nan` or `inf` in tensor") != \ - std::string::npos); \ - } \ - EXPECT_FALSE(caught_exception); \ +#define CHECK_NO_NAN_INF(tensors) \ + { \ + bool caught_exception = false; \ + try { \ + CheckTensorHasNanOrInf("nan_inf_test", tensors); \ + } catch (paddle::platform::EnforceNotMet & error) { \ + caught_exception = true; \ + std::string ex_msg = error.what(); \ + EXPECT_TRUE(ex_msg.find("There are NAN or INF") != std::string::npos); \ + } \ + EXPECT_FALSE(caught_exception); \ } TEST(NanInfUtils, Functions) { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index f80bb94b30b648..30046b2d1d44e8 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/phi/common/amp_type_traits.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" @@ -24,6 +25,8 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/kernels/funcs/eigen/extensions.h" +DECLARE_int32(check_nan_inf_level); + namespace paddle { namespace framework { namespace details { @@ -90,7 +93,7 @@ static void InitWhiteListFormEnv() { const char* op_role_skip = std::getenv("PADDLE_INF_NAN_SKIP_ROLE"); const char* op_var_skip = std::getenv("PADDLE_INF_NAN_SKIP_VAR"); - if (op_type_skip != NULL) { + if (op_type_skip) { std::stringstream ss(op_type_skip); std::string op_type; while (std::getline(ss, op_type, ',')) { @@ -98,7 +101,7 @@ static void InitWhiteListFormEnv() { } } - if (op_role_skip != NULL) { + if (op_role_skip) { std::stringstream ss(op_role_skip); std::string op_role; while (std::getline(ss, op_role, ',')) { @@ -113,7 +116,7 @@ static void InitWhiteListFormEnv() { } } - if (op_var_skip != NULL) { + 
if (op_var_skip) { std::stringstream ss(op_var_skip); std::string op_var; while (std::getline(ss, op_var, ',')) { @@ -131,175 +134,101 @@ static void InitWhiteListFormEnv() { } } -template -static void PrintNanInf(const T* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name, - bool abort = true) { - T min_value = std::numeric_limits::max(); - T max_value = std::numeric_limits::min(); - size_t nan_count, inf_count, num_count; - nan_count = inf_count = num_count = 0; - - // CPU print num value - for (size_t i = 0; i < numel; ++i) { - size_t count = 0; - if (std::isnan(value[i])) { - count = nan_count++; - } else if (std::isinf(value[i])) { - count = inf_count++; - } else { - count = num_count++; - min_value = std::min(min_value, value[i]); - max_value = std::max(max_value, value[i]); - } - - if (count < static_cast(print_num)) { - printf("numel:%zu index:%zu value:%f\n", - numel, - i, - static_cast(value[i])); - } - } - printf( - "In cpu, there has %zu,%zu,%zu nan,inf,num. " - "And in num, min_value is %f, max_value is %f\n", - nan_count, - inf_count, - num_count, - static_cast(min_value), - static_cast(max_value)); - if (abort) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); - } -} +template < + typename T, + std::enable_if_t>::value && + !std::is_same>::value, + bool> = true> +static void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str) { + using MT = typename phi::dtype::template MPTypeTrait::Type; + +#ifdef _OPENMP + // Use maximum 4 threads to collect the nan and inf information. + int num_threads = std::max(omp_get_num_threads(), 1); + num_threads = std::min(num_threads, 4); +#else + int num_threads = 1; +#endif -// openmp 4.0, reduction with fp16 -#if defined _OPENMP && _OPENMP >= 201307 -// more detail see: 180 page of -// https://www.openmp.org/wp-content/uploads/OpenMP4.0.0.pdf -#pragma omp declare reduction(+ : paddle::platform::float16 : omp_out += omp_in) -#pragma omp declare reduction(+ : paddle::platform::bfloat16 : omp_out += \ - omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex < \ - float > : omp_out += omp_in) -#pragma omp declare reduction(+ : paddle::platform::complex < \ - double > : omp_out += omp_in) + std::vector thread_num_nan(num_threads, 0); + std::vector thread_num_inf(num_threads, 0); + std::vector thread_min_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_max_value(num_threads, static_cast(value_ptr[0])); + std::vector thread_mean_value(num_threads, static_cast(0)); +#ifdef _OPENMP +#pragma omp parallel num_threads(num_threads) #endif - -template -static void CheckNanInf(const T* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - T sum = static_cast(0.0); -#if defined _OPENMP && _OPENMP >= 201307 -#pragma omp parallel for simd reduction(+ : sum) -#elif defined _OPENMP -#pragma omp parallel for reduction(+ : sum) + { +#ifdef _OPENMP + int64_t tid = omp_get_thread_num(); + int64_t chunk_size = (numel + num_threads - 1) / num_threads; + int64_t begin = tid * chunk_size; + int64_t end = chunk_size + begin > numel ? 
numel : chunk_size + begin; +#else + int64_t tid = 0; + int64_t begin = 0; + int64_t end = numel; #endif - for (size_t i = 0; i < numel; ++i) { - sum += (value[i] - value[i]); - } + for (int64_t i = begin; i < end; ++i) { + MT value = static_cast(value_ptr[i]); - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); - } -} + thread_min_value[tid] = std::min(thread_min_value[tid], value); + thread_max_value[tid] = std::max(thread_max_value[tid], value); + thread_mean_value[tid] += value / static_cast(numel); -#if defined _OPENMP && _OPENMP >= 201307 -// openmp4.0 not need to specialization fp16 -#elif defined _OPENMP -template <> -void CheckNanInf( - const paddle::platform::float16* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float sum = 0.0f; -#pragma omp parallel for reduction(+ : sum) - for (size_t i = 0; i < numel; ++i) { - sum += static_cast(value[i] - value[i]); - } - - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); + if (std::isnan(value)) { + thread_num_nan[tid] += 1; + } else if (std::isinf(value)) { + thread_num_inf[tid] += 1; + } + } } -} -template <> -void CheckNanInf( - const paddle::platform::bfloat16* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float sum = 0.0f; -#pragma omp parallel for reduction(+ : sum) - for (size_t i = 0; i < numel; ++i) { - sum += static_cast(value[i] - value[i]); + int64_t num_nan = 0; + int64_t num_inf = 0; + MT min_value = thread_min_value[0]; + MT max_value = thread_max_value[0]; + MT mean_value = static_cast(0); + for (int i = 0; i < num_threads; ++i) { + num_nan += thread_num_nan[i]; + num_inf += thread_num_inf[i]; + min_value = std::min(thread_min_value[i], min_value); + max_value = std::max(thread_max_value[i], max_value); + mean_value += thread_mean_value[i]; } - if (std::isnan(sum) || std::isinf(sum)) { - PrintNanInf(value, numel, print_num, op_type, var_name); - } + PrintForDifferentLevel(cpu_hint_str.c_str(), + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + FLAGS_check_nan_inf_level); } -template <> -void CheckNanInf>( - const paddle::platform::complex* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - float real_sum = 0.0f; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } - - float imag_sum = 0.0f; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); - } - - if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || - std::isinf(imag_sum)) { - // hot fix for compile failed in gcc4.8 - // here also need print detail info of nan or inf later - PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); - } -} +template < + typename T, + std::enable_if_t>::value || + std::is_same>::value, + bool> = true> +void CheckNanInfCpuImpl(const T* value_ptr, + const int64_t numel, + const std::string& cpu_hint_str) { + using RealType = typename T::value_type; -template <> - void CheckNanInf < paddle::platform::complex < double >>> - (const paddle::platform::complex* value, - const size_t numel, - int print_num, - const std::string& op_type, - const std::string& var_name) { - 
double real_sum = 0.0; -#pragma omp parallel for reduction(+ : real_sum) - for (size_t i = 0; i < numel; ++i) { - real_sum += (value[i].real - value[i].real); - } + RealType real_sum = 0.0f, imag_sum = 0.0f; - double imag_sum = 0.0; -#pragma omp parallel for reduction(+ : imag_sum) - for (size_t i = 0; i < numel; ++i) { - imag_sum += (value[i].imag - value[i].imag); +#ifdef _OPENMP +#pragma omp parallel for reduction(+ : real_sum) reduction(+ : imag_sum) +#endif + for (int64_t i = 0; i < numel; ++i) { + T value = value_ptr[i]; + real_sum += (value.real - value.real); + imag_sum += (value.imag - value.imag); } if (std::isnan(real_sum) || std::isinf(real_sum) || std::isnan(imag_sum) || @@ -307,14 +236,10 @@ template <> // hot fix for compile failed in gcc4.8 // here also need print detail info of nan or inf later PADDLE_THROW(platform::errors::PreconditionNotMet( - "There are `nan` or `inf` in tensor (%s) of operator (%s).", - var_name, - op_type)); + "There are NAN or INF in %s.", cpu_hint_str)); } } -#endif - template <> template void TensorCheckerVisitor::apply( @@ -323,10 +248,9 @@ void TensorCheckerVisitor::apply( std::is_same>::value || std::is_same>::value>::type*) const { - // use env strategy control in future, -1=print_all. - int print_num = 3; - CheckNanInf( - tensor_.data(), tensor_.numel(), print_num, op_type_, var_name_); + std::string cpu_hint_str = + GetCpuHintString(op_type, var_name, tensor.place()); + CheckNanInfCpuImpl(tensor.data(), tensor.numel(), cpu_hint_str); } template <> @@ -371,8 +295,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, tensor_check(op_type, var_name, *tensor, place); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile with " - "GPU.", + "phi::DenseTensor[%s] use gpu place. PaddlePaddle must compile " + "with GPU.", var_name)); #endif return; @@ -406,8 +330,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile with " - "XPU.", + "phi::DenseTensor[%s] use xpu place. PaddlePaddle must compile " + "with XPU.", var_name)); #endif return; @@ -440,8 +364,8 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #else PADDLE_THROW(platform::errors::PreconditionNotMet( - "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile with " - "NPU.", + "phi::DenseTensor[%s] use npu place. PaddlePaddle must compile " + "with NPU.", var_name)); #endif return; diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index abf575b4ca5453..629ab737055a47 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -138,6 +138,54 @@ __global__ void CheckNanInfKernel(const T* value, PrintNanInfKernel(value, numel, print_num, debug_info); } +template +__device__ T BlockReduce(T value) { + __shared__ T shared_mem[1024]; + + shared_mem[threadIdx.x] = value; + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride = stride >> 1) { + if (threadIdx.x < stride) { + T value0 = shared_mem[threadIdx.x]; + T value1 = shared_mem[threadIdx.x + stride]; + T reduce_value; + if (ReduceType == 0) { + // max + reduce_value = value0 > value1 ? value0 : value1; + } else if (ReduceType == 1) { + // min + reduce_value = value0 < value1 ? 
value0 : value1; + } else if (ReduceType == 2) { + // sum + reduce_value = value0 + value1; + } + shared_mem[threadIdx.x] = reduce_value; + } + + if (stride > 16) { + __syncthreads(); + } + } + + __syncthreads(); + return shared_mem[0]; +} + +__device__ void BlockReduceNumNanInfAndWrite(const int64_t num_nan, + const int64_t num_inf, + int64_t offset, + int64_t* num_nan_ptr, + int64_t* num_inf_ptr) { + int64_t block_num_nan = BlockReduce(num_nan); + int64_t block_num_inf = BlockReduce(num_inf); + + if (threadIdx.x == 0) { + num_nan_ptr[offset] = block_num_nan; + num_inf_ptr[offset] = block_num_inf; + } +} + template < typename T, std::enable_if_t>::value || @@ -183,15 +231,16 @@ __device__ void BlockReduceMaxMinAndWrite(const T max_value, template __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, const int64_t numel, - int* found_nan_inf_ptr, + int64_t* block_num_nan_ptr, + int64_t* block_num_inf_ptr, MT* tensor_block_max_ptr, MT* tensor_block_min_ptr, MT* tensor_block_mean_ptr) { - bool has_nan = false; - bool has_inf = false; - int64_t i = threadIdx.x + blockIdx.x * blockDim.x; + int64_t num_nan = 0; + int64_t num_inf = 0; + MT max_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); MT min_value = static_cast(i < numel ? value_ptr[i] : value_ptr[0]); MT mean_value = static_cast(0); @@ -203,25 +252,14 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, mean_value += value / static_cast(numel); if (isnan(value)) { - has_nan = true; - } - if (isinf(value)) { - has_inf = true; - } - - if (has_nan || has_inf) { - if (!tensor_block_max_ptr && !tensor_block_min_ptr && - !tensor_block_mean_ptr) { - break; - } + num_nan += 1; + } else if (isinf(value)) { + num_inf += 1; } } - if (has_nan) { - found_nan_inf_ptr[0] = 1; - } - if (has_inf) { - found_nan_inf_ptr[1] = 1; - } + + BlockReduceNumNanInfAndWrite( + num_nan, num_inf, blockIdx.x, block_num_nan_ptr, block_num_inf_ptr); BlockReduceMaxMinAndWrite(max_value, min_value, @@ -232,32 +270,9 @@ __global__ void FindNanInfAndBlockMaxMin(const T* value_ptr, tensor_block_mean_ptr); } -template ::value, bool> = true> -__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { - if (check_nan_inf_level >= 3) { - return true; - } else if (check_nan_inf_level >= 2) { - MT fp16_max = - static_cast(std::numeric_limits::max()); - return max_value > fp16_max || min_value < -fp16_max; - } - return false; -} - -template ::value, bool> = true> -__device__ bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { - if (check_nan_inf_level >= 3) { - return true; - } - return false; -} - template -__global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, +__global__ void FindGlobalMaxMinAndPrint(const int64_t* block_num_nan_ptr, + const int64_t* block_num_inf_ptr, const MT* tensor_block_max_ptr, const MT* tensor_block_min_ptr, const MT* tensor_block_mean_ptr, @@ -266,8 +281,14 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, int64_t numel_max_min, int check_nan_inf_level) { if (blockIdx.x == 0 && threadIdx.x == 0) { - int has_nan = found_nan_inf_ptr[0]; - int has_inf = found_nan_inf_ptr[1]; + int64_t num_nan = 0; + int64_t num_inf = 0; + + // numel_max_min <= 128 + for (int64_t i = 0; i < numel_max_min; ++i) { + num_nan += block_num_nan_ptr[i]; + num_inf += block_num_inf_ptr[i]; + } MT max_value = static_cast(0); MT min_value = static_cast(0); @@ -289,67 +310,31 @@ __global__ void FindGlobalMaxMinAndPrint(const int* found_nan_inf_ptr, } } - if (has_nan || has_inf) 
{ - if (check_nan_inf_level == 0) { - PADDLE_ENFORCE(false, - "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " - "find_inf=%d, " - "max=%e, min=%e, mean=%e===\n", - debug_info, - numel, - has_nan, - has_inf, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } else if (check_nan_inf_level >= 1) { - printf( - "===[PRECISION] [ERROR] in %s, numel=%ld, find_nan=%d, " - "find_inf=%d, " - "max=%e, min=%e, mean=%e===\n", - debug_info, - numel, - has_nan, - has_inf, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } - } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { - printf("[PRECISION] in %s, numel=%ld, max=%e, min=%e, mean=%e\n", - debug_info, - numel, - static_cast(max_value), - static_cast(min_value), - static_cast(mean_value)); - } + PrintForDifferentLevel(debug_info, + numel, + num_nan, + num_inf, + max_value, + min_value, + mean_value, + check_nan_inf_level); } } -template <> template -void TensorCheckerVisitor::apply( - typename std::enable_if< - std::is_floating_point::value || - std::is_same>::value || - std::is_same>::value>::type*) - const { - auto* dev_ctx = reinterpret_cast( - platform::DeviceContextPool::Instance().Get(tensor_.place())); - int dev_id = tensor_.place().device; +static char* GetGpuHintStringPtr(const phi::GPUContext& ctx, + const std::string& op_type, + const std::string& var_name, + int dev_id) { PADDLE_ENFORCE_EQ( (dev_id >= 0 && dev_id < multi_op_var2gpu_str_mutex().size()), true, platform::errors::OutOfRange("GPU dev_id must >=0 and < dev_count=%d", multi_op_var2gpu_str_mutex().size())); - std::string dtype_str = DataTypeToString(DataTypeTrait::DataType()); - if (dtype_str == "::paddle::platform::float16") { - dtype_str = "float16"; - } - std::string op_var = "[op=" + op_type_ + "] [tensor=" + var_name_ + - "] [dtype=" + dtype_str + "]"; - char* gpu_str_ptr = NULL; + std::string op_var = + GetCpuHintString(op_type, var_name, ctx.GetPlace(), dev_id); + char* gpu_str_ptr = nullptr; { auto& op_var2gpu_str_mutex = multi_op_var2gpu_str_mutex().at(dev_id); @@ -358,9 +343,9 @@ void TensorCheckerVisitor::apply( std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert auto gpu_str_tensor = paddle::memory::Alloc( - dev_ctx->GetPlace(), + ctx.GetPlace(), op_var.length() + 1, - phi::Stream(reinterpret_cast(dev_ctx->stream()))); + phi::Stream(reinterpret_cast(ctx.stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); @@ -378,13 +363,13 @@ void TensorCheckerVisitor::apply( iter->first.c_str(), op_var.length() + 1, hipMemcpyHostToDevice, - dev_ctx->stream())); + ctx.stream())); #else PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(gpu_str_ptr, iter->first.c_str(), op_var.length() + 1, cudaMemcpyHostToDevice, - dev_ctx->stream())); + ctx.stream())); #endif } else { // get auto iter = op_var2gpu_str.find(op_var); @@ -397,6 +382,22 @@ void TensorCheckerVisitor::apply( gpu_str_ptr = reinterpret_cast(iter->second->ptr()); } } + return gpu_str_ptr; +} + +template <> +template +void TensorCheckerVisitor::apply( + typename std::enable_if< + std::is_floating_point::value || + std::is_same>::value || + std::is_same>::value>::type*) + const { + auto* dev_ctx = reinterpret_cast( + platform::DeviceContextPool::Instance().Get(tensor.place())); + int dev_id = tensor.place().device; + char* gpu_str_ptr = + GetGpuHintStringPtr(*dev_ctx, op_type, var_name, dev_id); #ifdef __HIPCC__ // 
HIP will throw GPU memory access fault if threads > 256 @@ -406,7 +407,7 @@ void TensorCheckerVisitor::apply( #endif size_t blocks = std::min(static_cast(128), - static_cast((tensor_.numel() + threads - 1) / threads)); + static_cast((tensor.numel() + threads - 1) / threads)); #ifdef __HIPCC__ int print_num = 3; @@ -415,44 +416,46 @@ void TensorCheckerVisitor::apply( dim3(threads), 0, dev_ctx->stream(), - tensor_.data(), - tensor_.numel(), + tensor.data(), + tensor.numel(), print_num, gpu_str_ptr); #else using MT = typename phi::dtype::MPTypeTrait::Type; - phi::DenseTensor found_nan_inf; - found_nan_inf.Resize({2}); - int* found_nan_inf_ptr = found_nan_inf.mutable_data(tensor_.place()); - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync( - found_nan_inf_ptr, 0, 2 * sizeof(int), dev_ctx->stream())); - int64_t numel_max_min = blocks; + phi::DenseTensor block_num_nan_inf; + block_num_nan_inf.Resize({static_cast(2 * numel_max_min)}); + int64_t* block_num_nan_ptr = + block_num_nan_inf.mutable_data(tensor.place()); + int64_t* block_num_inf_ptr = block_num_nan_ptr + numel_max_min; + phi::DenseTensor tensor_block_max_min; tensor_block_max_min.Resize({static_cast(3 * numel_max_min)}); MT* tensor_block_max_ptr = - tensor_block_max_min.mutable_data(tensor_.place()); + tensor_block_max_min.mutable_data(tensor.place()); MT* tensor_block_min_ptr = tensor_block_max_ptr + numel_max_min; MT* tensor_block_mean_ptr = tensor_block_max_ptr + 2 * numel_max_min; FindNanInfAndBlockMaxMin - <<stream()>>>(tensor_.data(), - tensor_.numel(), - found_nan_inf_ptr, + <<stream()>>>(tensor.data(), + tensor.numel(), + block_num_nan_ptr, + block_num_inf_ptr, tensor_block_max_ptr, tensor_block_min_ptr, tensor_block_mean_ptr); int check_nan_inf_level = FLAGS_check_nan_inf_level; FindGlobalMaxMinAndPrint - <<<1, 1, 0, dev_ctx->stream()>>>(found_nan_inf_ptr, + <<<1, 1, 0, dev_ctx->stream()>>>(block_num_nan_ptr, + block_num_inf_ptr, tensor_block_max_ptr, tensor_block_min_ptr, tensor_block_mean_ptr, gpu_str_ptr, - tensor_.numel(), + tensor.numel(), numel_max_min, check_nan_inf_level); #endif diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.h b/paddle/fluid/framework/details/nan_inf_utils_detail.h index 2a25bc7b68f366..0adf23fd029218 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.h +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.h @@ -24,21 +24,114 @@ namespace paddle { namespace framework { namespace details { +template ::value, bool> = true> +HOSTDEVICE bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } else if (check_nan_inf_level >= 2) { + MT fp16_max = + static_cast(std::numeric_limits::max()); + return max_value > fp16_max || min_value < -fp16_max; + } + return false; +} + +template ::value, bool> = true> +HOSTDEVICE bool NeedPrint(MT max_value, MT min_value, int check_nan_inf_level) { + if (check_nan_inf_level >= 3) { + return true; + } + return false; +} + +template +HOSTDEVICE void PrintForDifferentLevel(const char* debug_info, + int64_t numel, + int64_t num_nan, + int64_t num_inf, + MT max_value, + MT min_value, + MT mean_value, + int check_nan_inf_level) { + if (num_nan > 0 || num_inf > 0) { + printf( + "[PRECISION] [ERROR] in %s, numel=%lld, num_nan=%lld, " + "num_inf=%lld, max=%e, min=%e, mean=%e\n", + debug_info, + static_cast(numel), // NOLINT + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + if 
(check_nan_inf_level == 0) { +#if defined(__NVCC__) || defined(__HIPCC__) + PADDLE_ENFORCE(false, + "There are NAN or INF (num_nan=%ld, num_inf=%lld) in %s.", + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + debug_info); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "There are NAN or INF (num_nan=%lld, num_inf=%lld) in %s.", + static_cast(num_nan), // NOLINT + static_cast(num_inf), // NOLINT + debug_info)); +#endif + } + } else if (NeedPrint(max_value, min_value, check_nan_inf_level)) { + printf("[PRECISION] in %s, numel=%lld, max=%e, min=%e, mean=%e\n", + debug_info, + static_cast(numel), // NOLINT + static_cast(max_value), + static_cast(min_value), + static_cast(mean_value)); + } +} + +template +inline std::string GetCpuHintString(const std::string& op_type, + const std::string& var_name, + const phi::Place& place, + int device_id = -1) { + std::string dtype_str = DataTypeToString(DataTypeTrait::DataType()); + if (dtype_str == "float") { + dtype_str = "fp32"; + } else if (dtype_str == "double") { + dtype_str = "fp64"; + } else if (dtype_str == "::paddle::platform::float16") { + dtype_str = "fp16"; + } else if (dtype_str == "::paddle::platform::bfloat16") { + dtype_str = "bf16"; + } + + std::stringstream ss; + if (platform::is_gpu_place(place)) { + ss << "[device=gpu:" << device_id << ", "; + } else { + ss << "[device=cpu, "; + } + ss << "op=" << op_type << ", tensor=" << var_name << ", dtype=" << dtype_str + << "]"; + return ss.str(); +} + template struct TensorCheckerVisitor { - TensorCheckerVisitor(const std::string& op_type, - const std::string& var_name, - const phi::DenseTensor& tensor, - const platform::Place& place) - : op_type_(op_type), - var_name_(var_name), - tensor_(tensor), - place_(place) {} + TensorCheckerVisitor(const std::string& o, + const std::string& v, + const phi::DenseTensor& t, + const platform::Place& p) + : op_type(o), var_name(v), tensor(t), place(p) {} template void apply( typename std::enable_if::value>::type* = 0) const { - VLOG(10) << var_name_ << " need not to check, it's type is not float point"; + VLOG(10) << var_name << " need not to check, it's type is not float point"; } template @@ -49,10 +142,10 @@ struct TensorCheckerVisitor { std::is_same>::value>::type* = 0) const; - std::string op_type_; - std::string var_name_; - const phi::DenseTensor& tensor_; - const platform::Place& place_; + std::string op_type; + std::string var_name; + const phi::DenseTensor& tensor; + const platform::Place& place; }; template diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index d5207398adca96..7c18f9288c5e70 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -17,6 +17,16 @@ #include #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +// The difference between "sequential_run" and "serial_run": +// "sequential_run" dispatches OPs one by one according to the sequence in the +// Program, while "serial_run" ensures that all Ops are scheduled in a singal +// thread. In standalone executor, "sequential_run" is also "serial_run", while +// "serial_run" is not necessarily "sequential_run". 
+PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, + false, + "Enable sequential execution for standalone " + "executor, only applied to GPU OPs."); + namespace paddle { namespace framework { namespace interpreter { @@ -43,7 +53,7 @@ const std::string StringizeDownstreamMap( } const std::map>& DependencyBuilder::Build( - const std::vector& instructions, bool is_sequential_run) { + const std::vector& instructions) { PADDLE_ENFORCE_EQ( is_build_, false, @@ -56,7 +66,7 @@ const std::map>& DependencyBuilder::Build( BuildOpHappensBefore(); ShrinkDownstreamMap(); - if (is_sequential_run) { + if (FLAGS_new_executor_sequential_run) { AddDependencyForSequentialRun(); } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h index ca7331d4b78e47..ec1119e701da3d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.h @@ -36,7 +36,7 @@ class DependencyBuilder { // build op dependencies and return the mapping from op to its downstream-op // set const std::map>& Build( - const std::vector& instructions, bool is_sequential_run); + const std::vector& instructions); const std::map>& OpDownstreamMap() const; diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc index 88fac23338f543..fa33610096b7cc 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.cc @@ -70,21 +70,30 @@ inline std::string RunTypeToString(DownstreamRunType run_type) { } void StreamAnalyzer::ConstructEvents( - const DependencyBuilder& dependency_builder, std::vector* instructions) const { + std::vector cross_step_merged_instructions = *instructions; + for (const Instruction& instr : *instructions) { + cross_step_merged_instructions.emplace_back(instr); + } + + DependencyBuilder dependency_builder; + dependency_builder.Build(cross_step_merged_instructions); + const std::map>& downstream_map = dependency_builder.OpDownstreamMap(); - const size_t instr_num = instructions->size(); + const size_t instr_num = cross_step_merged_instructions.size(); std::vector>> run_type_info( instr_num, std::vector>( - /*number_of_run_type = */ 3)); // instr_id -> run_type -> + /*number_of_run_type = */ 2)); // instr_id -> run_type -> // next_instr_id - AnalyseAllRunType(*instructions, downstream_map, &run_type_info); + AnalyseAllRunType( + cross_step_merged_instructions, downstream_map, &run_type_info); std::map>> event_info; // DeviceContext -> waiter_instr_id -> recorder_instr_ids - AnalyseAllEventInfo(*instructions, run_type_info, &event_info); + AnalyseAllEventInfo( + cross_step_merged_instructions, run_type_info, &event_info); ShrinkEventInfo(dependency_builder, &event_info); // Construct events @@ -93,7 +102,17 @@ void StreamAnalyzer::ConstructEvents( for (auto& waiter_item : context_item.second) { size_t waiter_instr_id = waiter_item.first; std::set& recorder_instr_ids = waiter_item.second; + + if (waiter_instr_id >= instructions->size()) { + waiter_instr_id -= instructions->size(); + } + for (size_t recorder_instr_id : recorder_instr_ids) { + // Redundant record + if (recorder_instr_id >= instructions->size()) { + continue; + } + Instruction& recorder_instr = instructions->at(recorder_instr_id); Instruction& waiter_instr = instructions->at(waiter_instr_id); 
platform::DeviceType waiter_type = GetWaiterType(waiter_instr); diff --git a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h index b9a228869d4c96..de0e6c741c2451 100644 --- a/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h +++ b/paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h @@ -37,8 +37,7 @@ class StreamAnalyzer { ~StreamAnalyzer() {} - void ConstructEvents(const DependencyBuilder& dependency_builder, - std::vector* instructions) const; + void ConstructEvents(std::vector* instructions) const; platform::DeviceContext* ParseDeviceContext( const OpFuncNode& op_func_node) const; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 070230af4d7867..a0aa82102e315d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -33,15 +33,6 @@ #endif #include "paddle/phi/backends/device_manager.h" -// The difference between "sequential_run" and "serial_run": -// "sequential_run" dispatches OPs one by one according to the sequence in the -// Program, while "serial_run" ensures that all Ops are scheduled in a singal -// thread. In standalone executor, "sequential_run" is also "serial_run", while -// "serial_run" is not necessarily "sequential_run". -PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run, - false, - "Enable sequential execution for standalone " - "executor, only applied to GPU OPs."); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, false, "Use inplace in new executor"); @@ -519,9 +510,7 @@ void InterpreterCore::BuildOperatorDependences() { // and set the dependecy_count_ size_t instr_num = vec_instruction_.size(); dependecy_count_.resize(instr_num); - auto downstream_map = dependency_builder_.Build( - vec_instruction_, - /*is_sequential_run=*/FLAGS_new_executor_sequential_run); + auto downstream_map = dependency_builder_.Build(vec_instruction_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { Instruction& cur_instr = vec_instruction_[instr_id]; @@ -588,7 +577,13 @@ void InterpreterCore::Convert( BuildOperatorDependences(); - stream_analyzer_.ConstructEvents(dependency_builder_, &vec_instruction_); + // NOTE(Ruibiao): For cross-step stream synchronization, an event may be + // recorded in the first step and waited in the second step. So, in the first + // step, the WaitEvent may be called without RecordEvent. Considering that + // before the first call to RecordEvent, an Event represents an empty set of + // work and WaitEvent always return succeed immediately, we omit the + // prelude-record for the first step here. + stream_analyzer_.ConstructEvents(&vec_instruction_); // add event for the input var of jit program, since there are async copied // from gpu_pinned place to gpu place on compute stream. 
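Note on the cross-step event construction in the interpretercore.cc / stream_analyzer.cc changes above: ConstructEvents now builds dependencies over a doubled instruction list, so that an event recorded near the end of step N can be waited on at the start of step N+1. Waiter ids that land in the duplicated half are folded back into the real (single-step) range, while duplicated recorder ids are skipped as redundant records. The snippet below is a minimal illustrative sketch of that index folding only, not part of this patch; the helper name MapMergedInstrId is hypothetical.

    #include <cassert>
    #include <cstddef>

    // Indices in [0, n) refer to instructions of the current step; indices in
    // [n, 2n) refer to the same instructions replayed in the next step and are
    // folded back onto their single-step counterparts (as the pass does for
    // waiter ids; recorder ids in [n, 2n) are simply dropped as redundant).
    inline size_t MapMergedInstrId(size_t merged_id, size_t n) {
      assert(merged_id < 2 * n);
      return merged_id >= n ? merged_id - n : merged_id;
    }
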
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index f42270f34a2205..9fb4e0b7eebaf5 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -301,7 +301,7 @@ class Instruction { void AddEventToRecord(std::shared_ptr event, platform::DeviceType waiter_type) { - event_to_record_ = std::make_unique(id_, event, waiter_type); + event_to_record_ = std::make_shared(id_, event, waiter_type); } void AddEventToWait(size_t instr_id, @@ -379,7 +379,7 @@ class Instruction { std::vector next_instrs_in_different_thread; std::vector next_instrs_in_same_thread; - std::unique_ptr event_to_record_; + std::shared_ptr event_to_record_; std::vector events_to_wait_; OpFuncNode op_func_node_; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index f67891feccc5ce..f765d9c22bbd5b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -253,6 +253,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // problem, so we filter them out. std::vector params_not_shared; + auto *scope = param_scope(); // The node->inputs contains input tensors and parameters. for (auto *x : node->inputs) { input_names.insert(x->Name()); @@ -264,6 +265,21 @@ void TensorRtSubgraphPass::CreateTensorRTOp( x->outputs.size() <= 1) { params_not_shared.push_back(x->Name()); } + // When TRT Engine's input is INT64, we need do some extra work. + // So we reserved a name for later use when casting INT64 -> INT32. + // We must check whether scope has had the same name var! + if (x->Var()->GetDataType() == framework::proto::VarType::INT64) { + std::string tmp_name = x->Name() + "_cast_to_INT32"; + LOG(WARNING) + << "tensorrt_subgraph's input named " << tmp_name + << " having int64 dtype in pdmodel description, we will cast them to " + "int32 dtype to feed them into paddle-trt."; + PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), + nullptr, + platform::errors::InvalidArgument( + "The var name %s has exists in scope.", tmp_name)); + scope->Var(tmp_name); + } } auto model_precision = @@ -273,13 +289,18 @@ void TensorRtSubgraphPass::CreateTensorRTOp( std::set output_names; std::set output_names_with_id; - std::map origin_name_output_dims; + std::map origin_name_output_rank; std::unordered_set trt_outputs; + // record the origin output data type + std::vector origin_outputs_dtype; + std::map map_origin_outputs_dtype; for (auto *x : node->outputs) { output_names.insert(x->Name()); output_names_with_id.insert(x->Name() + std::to_string(x->id())); - origin_name_output_dims[x->Name()] = x->Var()->GetShape().size(); + origin_name_output_rank[x->Name()] = x->Var()->GetShape().size(); trt_outputs.insert(x); + map_origin_outputs_dtype[x->Name()] = + static_cast(x->Var()->GetDataType()); } OutputProcess( @@ -353,14 +374,34 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // output_mapping help us copy the data from the renamed ITensor // to Tensor. 
std::vector output_mapping; - std::vector renamed_output_dims; + std::vector renamed_output_rank; for (auto name : output_names) { PADDLE_ENFORCE_NE(output_name_map.count(name), 0, platform::errors::PreconditionNotMet( "The output_name_map should have %s", name)); output_mapping.push_back(output_name_map[name]); - renamed_output_dims.push_back(origin_name_output_dims[name]); + renamed_output_rank.push_back(origin_name_output_rank[name]); + origin_outputs_dtype.push_back(map_origin_outputs_dtype[name]); + + // When TRT Engine's output is INT64, we need do some extra work. + // So we reserved a name for later use when casting INT32 -> INT64. + // We must check whether scope has had the same name var! + if (static_cast( + map_origin_outputs_dtype[name]) == + framework::proto::VarType::INT64) { + std::string tmp_name = name + "_cast_to_INT64"; + LOG(WARNING) << "tensorrt_subgraph's output named " << tmp_name + << " having int64 dtype in pdmodel description, but in fact " + "it is int32 " + "dtype after executing this tensorrt_subgraph, so we " + "need cast them into int64."; + PADDLE_ENFORCE_EQ(scope->FindVar(tmp_name), + nullptr, + platform::errors::InvalidArgument( + "The var name %s has exists in scope.", tmp_name)); + scope->Var(tmp_name); + } } PADDLE_ENFORCE_EQ(output_mapping.empty(), false, @@ -381,11 +422,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp( op_desc->SetBlockAttr("sub_block", new_block); op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString()); + op_desc->SetAttr("origin_outputs_dtype", origin_outputs_dtype); op_desc->SetAttr("max_batch_size", max_batch_size); op_desc->SetAttr("workspace_size", Get("workspace_size")); op_desc->SetAttr("gpu_id", Get("gpu_device_id")); op_desc->SetAttr("output_name_mapping", output_mapping); - op_desc->SetAttr("origin_output_dims", renamed_output_dims); + op_desc->SetAttr("origin_output_rank", renamed_output_rank); op_desc->SetAttr("parameters", params); op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime); op_desc->SetAttr("shape_range_info_path", shape_range_info_path); @@ -548,7 +590,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp( LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). 
This process may cost a lot of time."; - auto *scope = param_scope(); framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); std::unordered_set param_set(params.begin(), params.end()); inference::Singleton::Global() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index b4d39e687203e3..58b0d2a1189ad7 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -948,12 +948,7 @@ void AnalysisConfig::Update() { #endif } -#ifdef PADDLE_WITH_MKLDNN - // Do not optimize when mkldnn is on - if (enable_memory_optim_ && !use_mkldnn_) { -#else if (enable_memory_optim_) { -#endif pass_builder()->AppendAnalysisPass("memory_optimize_pass"); } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 0cb7191ce7d261..518404ad197756 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -211,6 +211,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "delete_quant_dequant_linear_op_pass", // "delete_weight_dequant_linear_op_pass", // "map_depthwise_conv_to_conv_pass", // + "constant_folding_pass", // "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // "embedding_eltwise_layernorm_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 255ef5d6d61945..301136d3533e04 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -732,7 +732,7 @@ TensorRTEngine::Weight TensorRTEngine::GetTrtWeight( for (int i = 0; i < weight_tensor.numel(); i++) { int32_data[i] = int64_data[i]; } - weight.SetDataType(phi::DataType::FLOAT32); + weight.SetDataType(phi::DataType::INT32); weight.SetValues(int32_data); } else { paddle::framework::TensorCopySync( diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 91876ab1544e1e..81e4ca89805ab2 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -60,6 +60,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { case FluidDT::VarType_Type_FP32: return TRT_DT::kFLOAT; case FluidDT::VarType_Type_INT32: + case FluidDT::VarType_Type_INT64: return TRT_DT::kINT32; case FluidDT::VarType_Type_FP16: return TRT_DT::kHALF; @@ -68,10 +69,9 @@ TRT_DT FluidDataType2TRT(FluidDT type) { return TRT_DT::kBOOL; #endif default: - return TRT_DT::kINT32; + PADDLE_THROW(platform::errors::InvalidArgument( + "unknown fluid datatype in TRT op converter")); } - PADDLE_THROW(platform::errors::InvalidArgument( - "unknown fluid datatype in TRT op converter")); return TRT_DT::kINT32; } diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 4367927bb17344..98d865247de54f 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -284,6 +284,15 @@ struct SimpleOpTypeSetTeller : public Teller { } } #endif + auto* block = desc.Block(); + if (block) { + auto* filter_var_desc = block->FindVar(desc.Input("Filter")[0]); + if (!filter_var_desc->Persistable()) { + VLOG(3) << "Trt not support filter is a intermediate tensor in " + "conv2d op."; + return false; + } + } } if (op_type == "deformable_conv") { @@ -1890,8 +1899,9 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } else { -#if !IS_TRT_VERSION_GE(8100) - VLOG(3) << "The version of 
TRT must be greater than 8100"; +#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \ + (IS_TRT_VERSION_LT(7200)) + VLOG(3) << "There are some bugs with trt 8.0"; return false; #endif } diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index e643efcb8b9f56..f93cb32a850efb 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -36,7 +36,7 @@ namespace operators { using DataLayout = phi::DataLayout; template -using CudnnDataType = platform::CudnnDataType; +using CudnnDataType = phi::backends::gpu::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 40cdb68329fb27..0e579010a91d79 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 4023aaa8445f95..35d1b45408b1f4 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -21,10 +21,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -91,7 +91,7 @@ class FusedBatchNormActKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, @@ -257,7 +257,7 @@ class FusedBatchNormActGradKernel "The Input dim size should be between 2 and 5")); int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 4c4756b8e19792..4d92a7865eb2c9 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -20,11 +20,11 @@ #include "paddle/fluid/framework/data_layout.h" #include 
"paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/norm_utils.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -85,7 +85,7 @@ class FusedBatchNormAddActKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); @@ -231,7 +231,7 @@ class FusedBatchNormAddActGradKernel int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; - ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); // init output auto *d_x = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/generator/CMakeLists.txt b/paddle/fluid/operators/generator/CMakeLists.txt index 53cbb990dc1ee9..62c11faadaf209 100644 --- a/paddle/fluid/operators/generator/CMakeLists.txt +++ b/paddle/fluid/operators/generator/CMakeLists.txt @@ -5,6 +5,7 @@ include(phi) set(op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml) set(legacy_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml) set(bw_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml) +set(static_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/static_ops.yaml) set(legacy_bw_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml) set(sparse_op_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml) @@ -29,10 +30,14 @@ set(parsed_op_dir ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator/parsed_ops) set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) +set(generated_static_op_path + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_static_op.cc) set(generated_sparse_ops_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_sparse_op.cc) set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) +set(generated_static_argument_mapping_path + ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_static_sig.cc) set(generated_sparse_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sparse_sig.cc) @@ -54,6 +59,8 @@ execute_process( COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${legacy_bw_op_yaml_file} --output_path ./parsed_ops/legacy_backward_ops.parsed.yaml --backward + COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${static_op_yaml_file} + --output_path ./parsed_ops/static_ops.parsed.yaml COMMAND ${PYTHON_EXECUTABLE} parse_op.py --op_yaml_path ${sparse_op_yaml_file} --output_path ./parsed_ops/sparse_ops.parsed.yaml COMMAND @@ -75,7 +82,8 @@ execute_process( COMMAND ${PYTHON_EXECUTABLE} cross_validate.py --forward_yaml_paths ./parsed_ops/ops.parsed.yaml ./parsed_ops/legacy_ops.parsed.yaml - --backward_yaml_paths ./parsed_ops/backward_ops.parsed.yaml + ./parsed_ops/static_ops.parsed.yaml --backward_yaml_paths + ./parsed_ops/backward_ops.parsed.yaml ./parsed_ops/legacy_backward_ops.parsed.yaml RESULT_VARIABLE _result) if(${_result}) @@ -113,6 +121,20 @@ if(${_result}) message(FATAL_ERROR "operator codegen failed, exiting.") 
endif() +execute_process( + WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator + COMMAND + ${PYTHON_EXECUTABLE} generate_static_op.py --ops_yaml_path + ./parsed_ops/static_ops.parsed.yaml --op_version_yaml_path + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_version.yaml + --op_compat_yaml_path ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml + --output_op_path "${generated_static_op_path}.tmp" --output_arg_map_path + "${generated_static_argument_mapping_path}.tmp" + RESULT_VARIABLE _result) +if(${_result}) + message(FATAL_ERROR "operator codegen failed, exiting.") +endif() + execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generator COMMAND @@ -126,84 +148,33 @@ if(${_result}) message(FATAL_ERROR "sparse operator codegen failed, exiting.") endif() -if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${generated_op_path}.tmp" "${generated_op_path}") - message("copy if different ${generated_op_path}.tmp ${generated_op_path}") -elseif(EXISTS "${generated_op_path}.tmp") - execute_process(COMMAND ${CMAKE_COMMAND} -E copy "${generated_op_path}.tmp" - "${generated_op_path}") - message("copy ${generated_op_path}.tmp ${generated_op_path}") -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f "${generated_op_path}") - message("remove ${generated_op_path}") -endif() - -if(EXISTS "${generated_sparse_ops_path}.tmp" AND EXISTS - "${generated_sparse_ops_path}") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy_if_different - "${generated_sparse_ops_path}.tmp" "${generated_sparse_ops_path}") - message( - "copy if different ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}" - ) -elseif(EXISTS "${generated_sparse_ops_path}.tmp") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy "${generated_sparse_ops_path}.tmp" - "${generated_sparse_ops_path}") - message("copy ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}") -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_sparse_ops_path}") - message("remove ${generated_sparse_ops_path}") -endif() - -if(EXISTS "${generated_argument_mapping_path}.tmp" - AND EXISTS "${generated_argument_mapping_path}") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy_if_different - "${generated_argument_mapping_path}.tmp" - "${generated_argument_mapping_path}") - message( - "copy if different ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" - ) -elseif(EXISTS "${generated_argument_mapping_path}.tmp") - execute_process( - COMMAND ${CMAKE_COMMAND} -E copy "${generated_argument_mapping_path}.tmp" - "${generated_argument_mapping_path}") - message( - "copy ${generated_argument_mapping_path}.tmp ${generated_argument_mapping_path}" - ) -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_argument_mapping_path}") - message("remove ${generated_argument_mapping_path}") -endif() - -if(EXISTS "${generated_sparse_argument_mapping_path}.tmp" - AND EXISTS "${generated_sparse_argument_mapping_path}") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy_if_different - "${generated_sparse_argument_mapping_path}.tmp" - "${generated_sparse_argument_mapping_path}") - message( - "copy if different ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" - ) -elseif(EXISTS "${generated_sparse_argument_mapping_path}.tmp") - execute_process( - COMMAND - ${CMAKE_COMMAND} -E copy 
"${generated_sparse_argument_mapping_path}.tmp" - "${generated_sparse_argument_mapping_path}") - message( - "copy ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" - ) -else() - execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f - "${generated_sparse_argument_mapping_path}") - message("remove ${generated_sparse_argument_mapping_path}") -endif() +set(generated_static_files + "${generated_op_path}" + "${generated_static_op_path}" + "${generated_sparse_ops_path}" + "${generated_argument_mapping_path}" + "${generated_static_argument_mapping_path}" + "${generated_sparse_argument_mapping_path}") + +foreach(generated_static_file ${generated_static_files}) + if(EXISTS "${generated_static_file}.tmp" AND EXISTS + "${generated_static_file}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${generated_static_file}.tmp" "${generated_static_file}") + message( + "copy if different ${generated_static_file}.tmp ${generated_static_file}") + elseif(EXISTS "${generated_static_file}.tmp") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy "${generated_static_file}.tmp" + "${generated_static_file}") + message("copy ${generated_static_file}.tmp ${generated_static_file}") + else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f + "${generated_static_file}") + message("remove ${generated_static_file}") + endif() +endforeach() # op extra info file set(ops_extra_info_gen_file diff --git a/paddle/fluid/operators/generator/generate_static_op.py b/paddle/fluid/operators/generator/generate_static_op.py new file mode 100644 index 00000000000000..b24e60dc4da1ad --- /dev/null +++ b/paddle/fluid/operators/generator/generate_static_op.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import argparse +import os +from pathlib import Path + +import yaml +from filters import ( + cartesian_prod_mapping, + to_input_name, + to_int_array_tensor_name, + to_int_array_tensors_name, + to_op_attr_type, + to_opmaker_name, + to_opmaker_name_cstr, + to_pascal_case, + to_scalar_tensor_name, +) +from generate_op import replace_compat_name +from jinja2 import Environment, FileSystemLoader, StrictUndefined +from parse_utils import to_named_dict +from tests import ( + is_base_op, + is_initializer_list, + is_scalar, + is_vec, + supports_inplace, + supports_no_need_buffer, +) + +file_loader = FileSystemLoader(Path(__file__).parent / "templates") +env = Environment( + loader=file_loader, + keep_trailing_newline=True, + trim_blocks=True, + lstrip_blocks=True, + undefined=StrictUndefined, + extensions=['jinja2.ext.do'], +) +env.filters["to_op_attr_type"] = to_op_attr_type +env.filters["to_opmaker_name"] = to_opmaker_name +env.filters["to_pascal_case"] = to_pascal_case +env.filters["to_scalar_tensor_name"] = to_scalar_tensor_name +env.filters["to_int_array_tensor_name"] = to_int_array_tensor_name +env.filters["to_int_array_tensors_name"] = to_int_array_tensors_name +env.filters["to_input_name"] = to_input_name +env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr +env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping +env.tests["base_op"] = is_base_op +env.tests["vec"] = is_vec +env.tests["scalar"] = is_scalar +env.tests["initializer_list"] = is_initializer_list +env.tests["supports_inplace"] = supports_inplace +env.tests["supports_no_need_buffer"] = supports_no_need_buffer + + +def restruct_io(op): + op["input_dict"] = to_named_dict(op["inputs"]) + op["attr_dict"] = to_named_dict(op["attrs"]) + op["output_dict"] = to_named_dict(op["outputs"]) + return op + + +def main( + ops_yaml_path, + op_compat_yaml_path, + op_version_yaml_path, + output_op_path, + output_arg_map_path, +): + with open(ops_yaml_path, "rt") as f: + ops = yaml.safe_load(f) + ops = [restruct_io(op) for op in ops] + forward_op_dict = to_named_dict(ops) + + with open(op_version_yaml_path, "rt") as f: + op_versions = yaml.safe_load(f) + + # add op version info into op + for op_version in op_versions: + if op_version['op'] in forward_op_dict: + forward_op_dict[op_version['op']]['version'] = op_version['version'] + + with open(op_compat_yaml_path, "rt") as f: + op_op_map = yaml.safe_load(f) + + for op in ops: + op['op_name'] = op['name'] + + replace_compat_name(op_op_map, forward_op_dict, {}) + + if len(ops) == 0: + if os.path.isfile(output_op_path): + os.remove(output_op_path) + if os.path.isfile(output_arg_map_path): + os.remove(output_arg_map_path) + return + + op_template = env.get_template('op.c.j2') + with open(output_op_path, "wt") as f: + msg = op_template.render( + ops=ops, backward_ops=[], op_dict=forward_op_dict + ) + f.write(msg) + + ks_template = env.get_template('ks.c.j2') + with open(output_arg_map_path, 'wt') as f: + msg = ks_template.render(ops=ops, backward_ops=[]) + f.write(msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate operator file from op yaml." + ) + parser.add_argument( + '--ops_yaml_path', type=str, help="parsed static ops yaml file." + ) + parser.add_argument( + '--op_compat_yaml_path', type=str, help="ops args compat yaml file." + ) + parser.add_argument( + '--op_version_yaml_path', type=str, help="ops version yaml file." + ) + parser.add_argument( + "--output_op_path", type=str, help="path to save generated operators." 
+ ) + parser.add_argument( + "--output_arg_map_path", + type=str, + help="path to save generated argument mapping functions.", + ) + + args = parser.parse_args() + main( + args.ops_yaml_path, + args.op_compat_yaml_path, + args.op_version_yaml_path, + args.output_op_path, + args.output_arg_map_path, + ) diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc deleted file mode 100644 index ebb588e8996b8c..00000000000000 --- a/paddle/fluid/operators/log_loss_op.cc +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class LogLossOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -template -class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Predicted", - "The input value (Predicted) of Log loss op." - "Predicted is a 2-D tensor with shape [batch_size, 1]."); - AddInput("Labels", - "The target value (Labels) of Log loss op." - "Labels is a 2-D tensor with shape [batch_size, 1]."); - AddOutput("Loss", - "The output tensor with shape [batch_size, 1] " - "which represents the log loss."); - AddAttr("epsilon", "Epsilon in log loss."); - AddComment(R"DOC( -LogLoss Operator. - -Log loss is a loss function used for binary classification. Log Loss quantifies -the accuracy of a classifier by penalising false classifications. Minimising the -Log Loss is equivalent to maximising the accuracy of the classifier. We define -Predicted as the values predicted by our model and Labels as the target ground -truth value. Log loss can evaluate how close the predicted values are to the -target. The shapes of Predicted and Labels are both [batch_size, 1]. 
-The equation is: - -$$ -Loss = - Labels * log(Predicted + \epsilon) - - (1 - Labels) * log(1 - Predicted + \epsilon) -$$ - -)DOC"); - } -}; - -class LogLossGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("Predicted"), "Input", "Predicted", "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasInput("Labels"), "Input", "Labels", "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Loss")), - "Input", - framework::GradVarName("Loss"), - "LogLossGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Predicted")), - "Output", - framework::GradVarName("Predicted"), - "LogLossGrad"); - - auto pred_dims = ctx->GetInputDim("Predicted"); - auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); - PADDLE_ENFORCE_EQ(loss_grad_dims, - pred_dims, - platform::errors::InvalidArgument( - "The dimensions of loss_grad must be equal to the " - "dimensions of Predicted," - "But received dimensions of loss_grad is [%s], " - "received Predicted is " - "[%s]", - loss_grad_dims, - pred_dims)); - - auto pred_grad_name = framework::GradVarName("Predicted"); - ctx->SetOutputDim(pred_grad_name, pred_dims); - } -}; - -template -class LogLossGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("log_loss_grad"); - op->SetInput("Predicted", this->Input("Predicted")); - op->SetInput("Labels", this->Input("Labels")); - op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); - op->SetOutput(framework::GradVarName("Predicted"), - this->InputGrad("Predicted")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(log_loss, - LogLossInferShapeFunctor, - PD_INFER_META(phi::LogLossInferMeta)); -REGISTER_OPERATOR(log_loss, - ops::LogLossOp, - ops::LogLossOpMaker, - ops::LogLossGradMaker, - ops::LogLossGradMaker, - LogLossInferShapeFunctor); -REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index 65a49dab27df25..5a540b802e60bd 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -99,9 +99,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { case ReshapeKernelOpName::reshape: InferShapeReshapeOp(ctx, x_dims, out_dims); break; - case ReshapeKernelOpName::reshape2: - InferShapeReshape2Op(ctx, x_dims, out_dims); - break; case ReshapeKernelOpName::squeeze: InferShapeSqueezeOp(ctx, x_dims, out_dims); break; @@ -127,17 +124,6 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); } - void InferShapeReshape2Op(const framework::ExecutionContext& ctx, - framework::DDim& x_dims, // NOLINT - framework::DDim& out_dims) const { // NOLINT - auto* out = ctx.Output("Out"); - auto* xshape = ctx.Output("XShape"); - auto xshape_dims = xshape->dims(); - x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - out_dims = out->dims(); - ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); - } - // in reshape1/2 ops "ShapeTensor" has highest priority and "Shape" has // second highest priority void ChangeReshapeOutDimsIfNeeded( @@ 
-400,14 +386,6 @@ REGISTER_OP_KERNEL( ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL( - reshape2, - MKLDNN, - paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - REGISTER_OP_KERNEL( reshape2_grad, MKLDNN, diff --git a/paddle/fluid/operators/norm_utils.h b/paddle/fluid/operators/norm_utils.h deleted file mode 100644 index edaf19f68f1b38..00000000000000 --- a/paddle/fluid/operators/norm_utils.h +++ /dev/null @@ -1,51 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using DataLayout = phi::DataLayout; - -inline void ExtractNCWHD(const framework::DDim &dims, - const DataLayout &data_layout, - int *N, - int *C, - int *H, - int *W, - int *D) { - *N = dims[0]; - if (dims.size() == 2) { - *C = dims[1]; - *H = 1; - *W = 1; - *D = 1; - } else { - *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1]; - *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3]) - : 1; - } -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/put_along_axis_op.cc b/paddle/fluid/operators/put_along_axis_op.cc deleted file mode 100644 index 551dba6d839ed4..00000000000000 --- a/paddle/fluid/operators/put_along_axis_op.cc +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/ternary.h" - -namespace paddle { -namespace operators { - -class PutAlongAxisOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context()); - } -}; - -class PutAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor of PutAlongAxisOp"); - AddInput("Index", "The index tensor of PutAlongAxisOp"); - AddInput("Value", "The value tensor of PutAlongAxisOp"); - AddOutput("Result", "The result tensor of PutAlongAxisOp"); - AddAttr("Axis", "The axis that we do PutAlongAxis operation"); - AddAttr("Reduce", "The reduce operation for scatter") - .SetDefault("assign"); - AddComment(R"DOC( - PutAlongAxis Operator.) - )DOC"); - } -}; - -class PutAlongAxisGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Result")), - ctx.device_context()); - } -}; - -template -class PutAlongAxisGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("put_along_axis_grad"); - op->SetInput("Index", this->Input("Index")); - op->SetInput("Input", this->Input("Input")); - - op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetOutput(framework::GradVarName("Value"), this->InputGrad("Value")); - op->SetAttrMap(this->Attrs()); - } -}; - -DECLARE_INPLACE_OP_INFERER(PutAlongAxisInplaceInferer, {"Input", "Result"}); - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(put_along_axis, - PutAlongAxisInferShapeFunctor, - PD_INFER_META(phi::PutAlongAxisInferMeta)); -REGISTER_OPERATOR(put_along_axis, - ops::PutAlongAxisOp, - ops::PutAlongAxisOpMaker, - ops::PutAlongAxisGradOpMaker, - ops::PutAlongAxisGradOpMaker, - paddle::operators::PutAlongAxisInplaceInferer, - PutAlongAxisInferShapeFunctor); - -REGISTER_OPERATOR(put_along_axis_grad, ops::PutAlongAxisGradOp); diff --git a/paddle/fluid/operators/searchsorted_op.cc b/paddle/fluid/operators/searchsorted_op.cc deleted file mode 100644 index 1beb06366ea919..00000000000000 --- a/paddle/fluid/operators/searchsorted_op.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class SearchSortedOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = - OperatorWithKernel::IndicateVarDataType(ctx, "SortedSequence"); - return framework::OpKernelType(data_type, ctx.device_context()); - } -}; - -class SearchSortedOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("SortedSequence", - "(Tensor), N-D or 1-D tensor, The value of the tensor" - "monotonically increases in the innermost dimension."); - AddInput("Values", "(Tensor), N-D tensor given values."); - AddOutput("Out", "(Tensor), The output tensor of searchsorted op."); - AddAttr("out_int32", - "the output tensor is int64 type if False and on the" - "contrary for int32") - .SetDefault(false); - AddAttr( - "right", - "corresponding to lower bound if False and upper bound if True") - .SetDefault(false); - - AddComment(R"DOC( - Searchsorted Operator. - - This OP is used to find the index of the corresponding sorted_sequence in the innermost dimension based on the given values. - -)DOC"); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(searchsorted, - SearchsortedInferShapeFunctor, - PD_INFER_META(phi::SearchsortedInferMeta)); -REGISTER_OPERATOR(searchsorted, - ops::SearchSortedOp, - ops::SearchSortedOpMaker, - SearchsortedInferShapeFunctor); diff --git a/paddle/fluid/operators/svd_op.cc b/paddle/fluid/operators/svd_op.cc deleted file mode 100644 index afbfd80b8d5379..00000000000000 --- a/paddle/fluid/operators/svd_op.cc +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -class SvdOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; -}; - -class SvdOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of svd op."); - AddOutput("U", "(Tensor), The output U tensor of svd op."); - AddOutput("S", "(Tensor), The output S tensor of svd op."); - AddOutput("VH", "(Tensor), The output VH tensor of svd op."); - AddAttr("full_matrices", - "(bool, default false) Only Compute the thin U and V" - "when set as True, the gradient have some random " - "attribute.") - .SetDefault(false); - AddComment(R"DOC( -Svd Operator. - -This operator is used to perform SVD operation for batched matrics $X$. -$$U, S, VH = svd(X)$$ - -)DOC"); - } -}; - -class SvdGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("U")), - "Input", - "U@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("VH")), - "Input", - "VH@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("S")), - "Input", - "S@Grad", - "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("U"), "Input", "U", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("S"), "Input", "S", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasInput("VH"), "Input", "VH", "SvdGrad"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - "X@Grad", - "SvdGrad"); - - auto d_x = ctx->GetInputDim(("X")); - ctx->SetOutputDim(framework::GradVarName("X"), d_x); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(dtype, ctx.GetPlace()); - } -}; - -template -class SvdGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - void Apply(GradOpPtr retv) const override { - retv->SetType("svd_grad"); - retv->SetInput(framework::GradVarName("U"), this->OutputGrad("U")); - retv->SetInput(framework::GradVarName("VH"), this->OutputGrad("VH")); - retv->SetInput(framework::GradVarName("S"), this->OutputGrad("S")); - retv->SetInput("U", this->Output("U")); - retv->SetInput("VH", this->Output("VH")); - retv->SetInput("S", this->Output("S")); - retv->SetInput("X", this->Input("X")); - retv->SetAttrMap(this->Attrs()); - retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -DECLARE_INFER_SHAPE_FUNCTOR(svd, - SvdInferShapeFunctor, - PD_INFER_META(phi::SvdInferMeta)); - -REGISTER_OPERATOR(svd, - ops::SvdOp, - ops::SvdOpMaker, - ops::SvdGradMaker, - ops::SvdGradMaker, - SvdInferShapeFunctor); - -REGISTER_OPERATOR(svd_grad, ops::SvdGradOp); diff --git a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc index 2d037a7c3ecc1a..6d9e161806d820 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_mlu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_mlu.cc @@ -72,7 
+72,7 @@ class SyncBatchNormMLUKernel : public framework::OpKernel { "The Input dim size should be less than 6.")); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); y->mutable_data(ctx.GetPlace()); mean_out->mutable_data(ctx.GetPlace()); @@ -320,7 +320,7 @@ class SyncBatchNormMLUGradKernel : public framework::OpKernel { "The Input X dim size should be less than 6.")); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); PADDLE_ENFORCE_EQ(scale->dims()[0], C, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc index 46b1ccc140ddb8..b25ca5b3823cef 100644 --- a/paddle/fluid/operators/sync_batch_norm_op_npu.cc +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -344,7 +344,7 @@ class SyncBatchNormNPUKernel : public framework::OpKernel { x_dims.size())); int N, C, H, W, D; - ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); int x_numel = x->numel(); auto place = ctx.GetPlace(); @@ -598,7 +598,7 @@ class SyncBatchNormNPUGradKernel : public framework::OpKernel { } int N, C, H, W, D; - ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); + phi::funcs::ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); int x_numel = x->numel(); auto place = ctx.GetPlace(); diff --git a/paddle/fluid/operators/take_along_axis_op.cc b/paddle/fluid/operators/take_along_axis_op.cc deleted file mode 100644 index 5e3424a552bf92..00000000000000 --- a/paddle/fluid/operators/take_along_axis_op.cc +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/ddim.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/binary.h" - -namespace paddle { -namespace operators { - -class TakeAlongAxisOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Input"), - ctx.device_context()); - } -}; - -class TakeAlongAxisOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Input", "The input tensor of TakeAlongAxisOp"); - AddInput("Index", "The index tensor of TakeAlongAxisOp"); - AddOutput("Result", "The result tensor of TakeAlongAxisOp"); - AddAttr("Axis", - "The Tensor which contains the axis that we do TakeAlongAxis " - "operation."); - AddComment(R"DOC( - Take_along_axis Operator.) 
- )DOC"); - } -}; - -class TakeAlongAxisGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - ctx->SetOutputDim(framework::GradVarName("Input"), - ctx->GetInputDim("Input")); - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Result")), - ctx.device_context()); - } -}; - -template -class TakeAlongAxisGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("take_along_axis_grad"); - op->SetInput("Index", this->Input("Index")); - op->SetInput("Input", this->Input("Input")); - - op->SetInput(framework::GradVarName("Result"), this->OutputGrad("Result")); - op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); - op->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -DECLARE_INFER_SHAPE_FUNCTOR(take_along_axis, - TakeAlongAxisInferShapeFunctor, - PD_INFER_META(phi::TakeAlongAxisInferMeta)); -REGISTER_OPERATOR(take_along_axis, - ops::TakeAlongAxisOp, - ops::TakeAlongAxisOpMaker, - ops::TakeAlongAxisGradOpMaker, - ops::TakeAlongAxisGradOpMaker, - TakeAlongAxisInferShapeFunctor); - -REGISTER_OPERATOR(take_along_axis_grad, ops::TakeAlongAxisGradOp); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 38cf5e2b823466..579549a4c3ec47 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -21,13 +21,13 @@ #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/place.h" #ifdef PADDLE_WITH_CUDA - #include #include #include #include #include #include +#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/fluid/framework/data_device_transform.h" #include "paddle/fluid/framework/executor.h" @@ -596,7 +596,14 @@ class TensorRTEngineOp : public framework::OperatorBase { if (type == framework::proto::VarType::FP32) { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT64) { - buffers[bind_index] = static_cast(t.data()); + auto int32_tensor = + scope.FindVar(x + "_cast_to_INT32")->GetMutable(); + *int32_tensor = phi::Cast( + reinterpret_cast(dev_ctx), + t, + phi::DataType::INT32); + buffers[bind_index] = + static_cast(int32_tensor->data()); } else if (type == framework::proto::VarType::INT32) { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::FP16) { @@ -614,8 +621,8 @@ class TensorRTEngineOp : public framework::OperatorBase { // Bind output tensor to TRT. 
int output_index = 0; - std::vector origin_output_dims = - Attr>("origin_output_dims"); + std::vector origin_output_rank = + Attr>("origin_output_rank"); VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { const int bind_index = @@ -636,7 +643,7 @@ class TensorRTEngineOp : public framework::OperatorBase { for (; nb_dims > 0; nb_dims--) { // some 'x 1' of shape is normal, no need to remove it if (dims.d[nb_dims - 1] != 1 || - nb_dims == origin_output_dims[output_index]) + nb_dims == origin_output_rank[output_index]) break; } for (int i = 0; i < nb_dims; i++) ddim.push_back(dims.d[i]); @@ -694,6 +701,28 @@ class TensorRTEngineOp : public framework::OperatorBase { } // Execute the engine. engine->Execute(runtime_batch, &buffers, stream); + + std::vector origin_outputs_dtype = + Attr>("origin_outputs_dtype"); + for (size_t i = 0; i < Outputs("Ys").size(); i++) { + auto type = + static_cast(origin_outputs_dtype[i]); + + if (type == framework::proto::VarType::INT64) { + auto y = Outputs("Ys")[i]; + auto *fluid_v = scope.FindVar(y); + auto *fluid_t = fluid_v->GetMutable(); + auto int32_tensor = + scope.FindVar(y + "_cast_to_INT64")->GetMutable(); + int32_tensor->Resize(fluid_t->dims()); + dev_ctx.Alloc(int32_tensor); + framework::TensorCopy(*fluid_t, dev_place, dev_ctx, int32_tensor); + *fluid_t = phi::Cast( + reinterpret_cast(dev_ctx), + *int32_tensor, + phi::DataType::INT64); + } + } } TensorRTEngine *GetEngine(const framework::Scope &scope, diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 6d37290d151485..49a74cc8800975 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -104,6 +104,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetType("tensorrt_engine"); engine_op_desc.SetInput("Xs", std::vector({"x"})); engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + engine_op_desc.SetAttr("origin_outputs_dtype", std::vector{5}); engine_op_desc.SetBlockAttr("sub_block", &block_desc); engine_op_desc.SetAttr("max_batch_size", static_cast(2)); @@ -119,7 +120,7 @@ void DynamicShapeTest(bool allow_build_at_runtime) { engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); - engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("origin_output_rank", std::vector({2})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; @@ -274,7 +275,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); - engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("origin_output_rank", std::vector({2})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); engine_op_desc.SetAttr("engine_serialized_data", std::string("")); int device_id = 0; diff --git a/paddle/fluid/platform/dynload/cusolver.h b/paddle/fluid/platform/dynload/cusolver.h index 854de23150cad7..c49c30eb65c42d 100644 --- a/paddle/fluid/platform/dynload/cusolver.h +++ b/paddle/fluid/platform/dynload/cusolver.h @@ -96,13 +96,22 @@ CUSOLVER_ROUTINE_EACH_R1(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif #if 
CUDA_VERSION >= 9020 -#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ - __macro(cusolverDnCreateSyevjInfo); \ - __macro(cusolverDnSsyevj_bufferSize); \ - __macro(cusolverDnDsyevj_bufferSize); \ - __macro(cusolverDnSsyevj); \ - __macro(cusolverDnDsyevj); \ - __macro(cusolverDnDestroySyevjInfo); +#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnDestroySyevjInfo); \ + __macro(cusolverDnXsyevjSetSortEig); \ + __macro(cusolverDnSsyevjBatched_bufferSize); \ + __macro(cusolverDnDsyevjBatched_bufferSize); \ + __macro(cusolverDnCheevjBatched_bufferSize); \ + __macro(cusolverDnZheevjBatched_bufferSize); \ + __macro(cusolverDnSsyevjBatched); \ + __macro(cusolverDnDsyevjBatched); \ + __macro(cusolverDnCheevjBatched); \ + __macro(cusolverDnZheevjBatched); CUSOLVER_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 2e5ca9ff4916e2..4cd0c5ab341c72 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -676,6 +676,16 @@ backward : log_double_grad inplace : (out_grad -> x_grad) +- backward_op : log_loss_grad + forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) + args : (Tensor input, Tensor label, Tensor out_grad, float epsilon) + output : Tensor(input_grad) + infer_meta : + func : UnchangedInferMeta + param : [input] + kernel : + func : log_loss_grad + - backward_op : logit_grad forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) args : (Tensor x, Tensor out_grad, float eps) @@ -779,6 +789,16 @@ kernel : func : poisson_grad +- backward_op : put_along_axis_grad + forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce = "assign") -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) + output : Tensor(arr_grad), Tensor(value_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [arr, indices] + kernel : + func : put_along_axis_grad + - backward_op : qr_grad forward : qr (Tensor x, str mode = "reduced") -> Tensor(q), Tensor(r) args : (Tensor x, Tensor q, Tensor r, Tensor q_grad, Tensor r_grad, str mode) @@ -1062,6 +1082,27 @@ backward : square_double_grad inplace : (out_grad -> x_grad) +- backward_op : svd_grad + forward : svd (Tensor x, bool full_matrices = false) -> Tensor(u), Tensor(s), Tensor(vh) + args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : svd_grad + optional: u_grad, vh_grad, s_grad + +- backward_op : take_along_axis_grad + forward : take_along_axis (Tensor arr, Tensor indices, int axis) -> Tensor(out) + args : (Tensor arr, Tensor indices, Tensor out_grad, int axis) + output : Tensor(arr_grad) + infer_meta : + func : UnchangedInferMeta + param : [arr] + kernel : + func : take_along_axis_grad + - backward_op : tan_grad forward : tan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 4001a75d0fa1d4..51e49ef831c473 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -745,16 +745,6 @@ func : linear_interp_grad data_type : 
output_grad -- backward_op : log_loss_grad - forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) - args : (Tensor input, Tensor label, Tensor out_grad, float epsilon) - output : Tensor(input_grad) - infer_meta : - func : UnchangedInferMeta - param : [input] - kernel : - func : log_loss_grad - - backward_op : log_softmax_grad forward : log_softmax(Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) @@ -1195,17 +1185,6 @@ data_type : x optional : boxes_num -# output is optional -- backward_op : put_along_axis_grad - forward : put_along_axis (Tensor arr, Tensor indices, Tensor value, int axis, str reduce) -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis, str reduce) - output : Tensor(arr_grad), Tensor(value_grad) - infer_meta : - func : GeneralBinaryGradInferMeta - param : [arr, indices] - kernel : - func : put_along_axis_grad - - backward_op : real_grad forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) @@ -1573,17 +1552,6 @@ no_need_buffer : x backward : sum_double_grad -- backward_op : svd_grad - forward : svd (Tensor x, bool full_matrices) -> Tensor(u), Tensor(s), Tensor(vh) - args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full_matrices) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param : [x] - kernel : - func : svd_grad - optional: u_grad, vh_grad, s_grad - - backward_op : swish_grad forward : swish (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad, float bete=1.0) @@ -1607,16 +1575,6 @@ data_type : out_grad optional : reserve_space -- backward_op : take_along_axis_grad - forward : take_along_axis (Tensor arr, Tensor indices, int axis) -> Tensor(out) - args : (Tensor arr, Tensor indices, Tensor out_grad, int axis) - output : Tensor(arr_grad) - infer_meta : - func : UnchangedInferMeta - param : [arr] - kernel : - func : take_along_axis_grad - - backward_op : temporal_shift_grad forward : temporal_shift(Tensor x, int seg_num, float shift_ratio, str data_format_str) -> Tensor(out) args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format_str) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index 2382739377eece..4b697f0182be29 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1068,15 +1068,6 @@ data_type : dtype backend : place -- op : log_loss - args : (Tensor input, Tensor label, float epsilon) - output : Tensor - infer_meta : - func : LogLossInferMeta - kernel : - func : log_loss - backward : log_loss_grad - - op : log_softmax args : (Tensor x, int axis) output : Tensor(out) @@ -1555,18 +1546,6 @@ optional : boxes_num backward : psroi_pool_grad -- op : put_along_axis - args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce) - output : Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [arr] - kernel : - func : put_along_axis - data_type : arr - inplace : (arr -> out) - backward : put_along_axis_grad - - op : randint args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) output : Tensor(out) @@ -1750,15 +1729,6 @@ func : scatter_nd_add backward : scatter_nd_add_grad -- op : searchsorted - args : (Tensor sorted_sequence, Tensor values, bool out_int32, bool right) - output : Tensor(out) - infer_meta : - func : SearchsortedInferMeta - kernel : - func : searchsorted - data_type : sorted_sequence - - op : segment_pool args : (Tensor x, Tensor 
segment_ids, str pooltype) output : Tensor(out), Tensor(summed_ids) @@ -1968,15 +1938,6 @@ data_type : x backward : sum_grad -- op : svd - args : (Tensor x, bool full_matrices) - output : Tensor(u), Tensor(s), Tensor(vh) - infer_meta : - func : SvdInferMeta - kernel : - func : svd - backward : svd_grad - - op : swish args : (Tensor x) output : Tensor(out) @@ -1998,17 +1959,6 @@ backward : sync_batch_norm_grad inplace : (mean -> mean_out), (variance -> variance_out) -- op : take_along_axis - args : (Tensor arr, Tensor indices, int axis) - output : Tensor - infer_meta : - func : UnchangedInferMeta - param : [indices] - kernel : - func : take_along_axis - data_type : arr - backward : take_along_axis_grad - - op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio, str data_format_str) output : Tensor diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 4945794dc58f5c..63c6c5c38f54f7 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -750,6 +750,13 @@ extra : attrs : [bool use_mkldnn = false, bool use_cudnn = false] +- op : log_loss + backward : log_loss_grad + inputs : + {input : Predicted, label : Labels} + outputs : + out : Loss + - op : log_softmax backward : log_softmax_grad extra : @@ -916,6 +923,15 @@ extra : attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] +- op : put_along_axis + backward : put_along_axis_grad + inputs : + {arr : Input, indices : Index, values : Value} + outputs : + out : Result + attrs : + {axis : Axis, reduce : Reduce} + - op : qr backward : qr_grad inputs : @@ -1029,6 +1045,12 @@ extra : attrs : [bool use_mkldnn = false] +- op : searchsorted + inputs : + {sorted_sequence : SortedSequence, values : Values} + outputs : + out : Out + - op : seed extra : attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] @@ -1176,6 +1198,13 @@ attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] +- op : svd + backward : svd_grad + inputs : + x : X + outputs : + {u : U, s : S, vh : VH} + - op : swish backward : swish_grad extra : @@ -1186,6 +1215,15 @@ extra : attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] +- op : take_along_axis + backward : take_along_axis_grad + inputs : + {arr : Input, indices : Index} + outputs : + out : Result + attrs : + axis : Axis + - op : tan backward : tan_grad inputs : diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e51f23dda220fa..32fe25624fe7b9 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -628,6 +628,15 @@ func : log2 backward: log2_grad +- op : log_loss + args : (Tensor input, Tensor label, float epsilon) + output : Tensor + infer_meta : + func : LogLossInferMeta + kernel : + func : log_loss + backward : log_loss_grad + - op : logit args : (Tensor x, float eps = 1e-6f) output : Tensor @@ -741,6 +750,18 @@ func : poisson backward : poisson_grad +- op : put_along_axis + args : (Tensor arr, Tensor indices, Tensor values, int axis, str reduce = "assign") + output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [arr] + kernel : + func : put_along_axis + data_type : arr + inplace : (arr -> out) + backward : put_along_axis_grad + - op : qr args : (Tensor x, str mode = "reduced") output : Tensor(q), Tensor(r) @@ -800,6 +821,15 @@ inplace : (x -> 
out) backward : rsqrt_grad +- op : searchsorted + args : (Tensor sorted_sequence, Tensor values, bool out_int32 = false, bool right = false) + output : Tensor(out) + infer_meta : + func : SearchsortedInferMeta + kernel : + func : searchsorted + data_type : sorted_sequence + - op : send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) @@ -907,6 +937,26 @@ square_sr {selected_rows -> selected_rows} backward : square_grad +- op : svd + args : (Tensor x, bool full_matrices = false) + output : Tensor(u), Tensor(s), Tensor(vh) + infer_meta : + func : SvdInferMeta + kernel : + func : svd + backward : svd_grad + +- op : take_along_axis + args : (Tensor arr, Tensor indices, int axis) + output : Tensor + infer_meta : + func : TakeAlongAxisInferMeta + param : [arr, indices, axis] + kernel : + func : take_along_axis + data_type : arr + backward : take_along_axis_grad + - op : tan args : (Tensor x) output : Tensor @@ -981,11 +1031,3 @@ kernel : func : unfold backward : unfold_grad - -- op: share_buffer - args : (Tensor[] x, bool[] share_dims_and_dtype={}) - output : Tensor[](out){x.size()}, Tensor[](xout){x.size()} - infer_meta : - func : ShareBufferInferMeta - kernel : - func : share_buffer diff --git a/paddle/phi/api/yaml/static_ops.yaml b/paddle/phi/api/yaml/static_ops.yaml new file mode 100644 index 00000000000000..1849b9f6c1e48e --- /dev/null +++ b/paddle/phi/api/yaml/static_ops.yaml @@ -0,0 +1,7 @@ +- op : share_buffer + args : (Tensor[] x, bool[] share_dims_and_dtype={}) + output : Tensor[](out){x.size()}, Tensor[](xout){x.size()} + infer_meta : + func : ShareBufferInferMeta + kernel : + func : share_buffer diff --git a/paddle/phi/backends/dynload/cusolver.h b/paddle/phi/backends/dynload/cusolver.h index 1354e310554804..a86e85144fd7fb 100644 --- a/paddle/phi/backends/dynload/cusolver.h +++ b/paddle/phi/backends/dynload/cusolver.h @@ -108,13 +108,22 @@ CUSOLVER_ROUTINE_EACH_R1(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif #if CUDA_VERSION >= 9020 -#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ - __macro(cusolverDnCreateSyevjInfo); \ - __macro(cusolverDnSsyevj_bufferSize); \ - __macro(cusolverDnDsyevj_bufferSize); \ - __macro(cusolverDnSsyevj); \ - __macro(cusolverDnDsyevj); \ - __macro(cusolverDnDestroySyevjInfo); +#define CUSOLVER_ROUTINE_EACH_R2(__macro) \ + __macro(cusolverDnCreateSyevjInfo); \ + __macro(cusolverDnSsyevj_bufferSize); \ + __macro(cusolverDnDsyevj_bufferSize); \ + __macro(cusolverDnSsyevj); \ + __macro(cusolverDnDsyevj); \ + __macro(cusolverDnDestroySyevjInfo); \ + __macro(cusolverDnXsyevjSetSortEig); \ + __macro(cusolverDnSsyevjBatched_bufferSize); \ + __macro(cusolverDnDsyevjBatched_bufferSize); \ + __macro(cusolverDnCheevjBatched_bufferSize); \ + __macro(cusolverDnZheevjBatched_bufferSize); \ + __macro(cusolverDnSsyevjBatched); \ + __macro(cusolverDnDsyevjBatched); \ + __macro(cusolverDnCheevjBatched); \ + __macro(cusolverDnZheevjBatched); CUSOLVER_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSOLVER_WRAP) #endif diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 6e87f40ed0ab07..467552032f0ad6 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -56,6 +56,9 @@ void Copy(const Context& dev_ctx, void* dst_ptr = nullptr; if (paddle::platform::is_cpu_place(dst_place)) { dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); +#ifdef PADDLE_WITH_MKLDNN + dst->set_layout(src.layout()); +#endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if 
(paddle::platform::is_gpu_place(dst_place) || paddle::platform::is_cuda_pinned_place(dst_place)) { @@ -81,7 +84,7 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( dst->place(), dst_place, - phi::errors::Unavailable( + errors::Unavailable( "The Dst Tensor's place and dst_place do not match, Tensor's place " "place is %s, dst_place is %s.", dst->place(), @@ -112,13 +115,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - phi::errors::Unavailable( + errors::Unavailable( "Source place and context place do not match, source " "place is %s, context place is %s.", src_gpu_place, @@ -137,17 +140,17 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_gpu_place, - ctx_gpu_place, - phi::errors::Unavailable( - "Destination place and context place do not match, " - "destination place is %s, context place is %s.", - dst_gpu_place, - ctx_gpu_place)); + PADDLE_ENFORCE_EQ( + dst_gpu_place, + ctx_gpu_place, + errors::Unavailable("Destination place and context place do not match, " + "destination place is %s, context place is %s.", + dst_gpu_place, + ctx_gpu_place)); auto stream = blocking ? nullptr : reinterpret_cast(dev_ctx).stream(); @@ -161,7 +164,7 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto stream = @@ -184,7 +187,7 @@ void Copy(const Context& dev_ctx, paddle::memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); } else { - PADDLE_THROW(phi::errors::Unavailable( + PADDLE_THROW(errors::Unavailable( "Context place dose not match the source and destination place.")); } } @@ -196,13 +199,13 @@ void Copy(const Context& dev_ctx, PADDLE_ENFORCE_EQ( paddle::platform::is_gpu_place(ctx_place), true, - phi::errors::PreconditionNotMet( + errors::PreconditionNotMet( "Context place error, excepted GPUPlace, but actually %s.", ctx_place)); auto ctx_gpu_place = ctx_place; PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place, - phi::errors::Unavailable( + errors::Unavailable( "Source place and context place do not match, source " "place is %s, context place is %s.", src_gpu_place, @@ -259,7 +262,7 @@ void Copy(const Context& dev_ctx, paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size, stream); #endif } else { - PADDLE_THROW(phi::errors::Unimplemented( + PADDLE_THROW(errors::Unimplemented( "Copy from %s to %s is not supported.", src_place, dst_place)); } } @@ -411,4 +414,12 @@ template void Copy(const CustomContext& dev_ctx, bool blocking, DenseTensor* dst); #endif + +#ifdef PADDLE_WITH_MKLDNN +template void Copy(const OneDNNContext& dev_ctx, + const DenseTensor& src, + Place dst_place, + bool blocking, + DenseTensor* dst); +#endif } // namespace phi diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/phi/kernels/funcs/norm_utils.cu.h similarity index 98% rename from paddle/fluid/operators/norm_utils.cu.h rename to paddle/phi/kernels/funcs/norm_utils.cu.h 
index 2412913995b95c..0971db10529a96 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/phi/kernels/funcs/norm_utils.cu.h @@ -24,8 +24,7 @@ limitations under the License. */ #include namespace cub = hipcub; #endif -#include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/kernels/funcs/math_function.h" #ifdef __HIPCC__ @@ -34,8 +33,8 @@ namespace cub = hipcub; #define LAUNCH_BOUNDS(BlockDim) #endif -namespace paddle { -namespace operators { +namespace phi { +namespace funcs { using DataLayout = phi::DataLayout; @@ -464,7 +463,8 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, const int sample_size = num / N / C; phi::DenseTensor scale_tmp; if (!Scale) { - scale_tmp.mutable_data({C}, ctx.GetPlace()); + scale_tmp.Resize({C}); + ctx.template Alloc(&scale_tmp); set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? Scale->data() : scale_tmp.data(); @@ -495,7 +495,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } if (dX) { - T *dx_data = dX->mutable_data(ctx.GetPlace()); + T *dx_data = ctx.template Alloc(dX); set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -552,7 +552,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } if (dScale) { - T *dscale_data = dScale->mutable_data(ctx.GetPlace()); + T *dscale_data = ctx.template Alloc(dScale); set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -605,7 +605,7 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } if (ddY) { - T *ddy_data = ddY->mutable_data(ctx.GetPlace()); + T *ddy_data = ctx.template Alloc(ddY); set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { @@ -670,5 +670,5 @@ void NormDoubleGradFunctor(const DeviceContext &ctx, } } } -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index 88bef61fa921ff..63202ca4a484d1 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -13,10 +13,10 @@ // limitations under the License. 
#pragma once - #include "paddle/fluid/memory/memory.h" #ifdef PADDLE_WITH_CUDA #include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/core/errors.h" #endif // PADDLE_WITH_CUDA #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h" @@ -54,6 +54,137 @@ static void CheckEighResult(const int batch, const int info) { info)); } +#ifdef PADDLE_WITH_CUDA + +#if CUDA_VERSION >= 11031 +static bool use_cusolver_syevj_batched = true; +#else +static bool use_cusolver_syevj_batched = false; +#endif + +#define CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(scalar_t, value_t) \ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, \ + int n, const scalar_t *A, int lda, const value_t *W, int *lwork, \ + syevjInfo_t params, int batchsize + +template +void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(scalar_t, value_t)) { + PADDLE_THROW(phi::errors::InvalidArgument( + "syevjBatched_bufferSize: not implemented for %s", + typeid(scalar_t).name())); +} + +template <> +inline void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(float, float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevjBatched_bufferSize( + handle, jobz, uplo, n, A, lda, W, lwork, params, batchsize)); +} + +template <> +inline void syevjBatched_bufferSize( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(double, double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevjBatched_bufferSize( + handle, jobz, uplo, n, A, lda, W, lwork, params, batchsize)); +} + +template <> +inline void syevjBatched_bufferSize, float>( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(phi::dtype::complex, + float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCheevjBatched_bufferSize( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + lwork, + params, + batchsize)); +} + +template <> +inline void syevjBatched_bufferSize, double>( + CUDASOLVER_SYEVJ_BATCHED_BUFFERSIZE_ARGTYPES(phi::dtype::complex, + double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZheevjBatched_bufferSize( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + lwork, + params, + batchsize)); +} + +#define CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(scalar_t, value_t) \ + cusolverDnHandle_t handle, cusolverEigMode_t jobz, cublasFillMode_t uplo, \ + int n, scalar_t *A, int lda, value_t *W, scalar_t *work, int lwork, \ + int *info, syevjInfo_t params, int batchsize + +template +void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(scalar_t, value_t)) { + PADDLE_THROW(phi::errors::InvalidArgument( + "syevjBatched: not implemented for %s", typeid(scalar_t).name())); +} + +template <> +inline void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(float, + float)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevjBatched( + handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchsize)); +} + +template <> +inline void syevjBatched(CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(double, + double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevjBatched( + handle, jobz, uplo, n, A, lda, W, work, lwork, info, params, batchsize)); +} + +template <> +inline void syevjBatched, float>( + CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(phi::dtype::complex, float)) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCheevjBatched(handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + reinterpret_cast(work), + lwork, + info, + params, + batchsize)); +} + +template <> +inline void syevjBatched, double>( + 
CUDASOLVER_SYEVJ_BATCHED_ARGTYPES(phi::dtype::complex, double)) { + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnZheevjBatched( + handle, + jobz, + uplo, + n, + reinterpret_cast(A), + lda, + W, + reinterpret_cast(work), + lwork, + info, + params, + batchsize)); +} +#endif + #ifdef PADDLE_WITH_CUDA static void CheckEighResult(const GPUContext &dev_ctx, const int64_t batch_size, @@ -232,17 +363,33 @@ struct MatrixEighFunctor { DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); T *input_vector = input_trans.data(); - // Once input data type is float32, and the last dimension of - // input is located in range [32, 512], Syevj works better. - bool use_syevj = (input.dtype() == phi::DataType::FLOAT32 && - values_stride >= 32 && values_stride <= 512); + // Precision loss will occur in some cases while using + // cusolverDnZheevjBatched to calculate in Paddle(cuda11.7) but it works + // well in Paddle(cuda10.2) + use_cusolver_syevj_batched = (use_cusolver_syevj_batched) && + (batch_size > 1) && + (input.dtype() != phi::DataType::COMPLEX128); + bool use_cusolver_syevj = (input.dtype() == phi::DataType::FLOAT32 && + last_dim >= 32 && last_dim <= 512); auto handle = dev_ctx.cusolver_dn_handle(); syevjInfo_t syevj_params; - if (use_syevj) { + if (use_cusolver_syevj_batched) { + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::cusolverDnCreateSyevjInfo(&syevj_params)); + syevjBatched_bufferSize(handle, + jobz, + uplo, + last_dim, + input_vector, + lda, + out_value, + &workspace_size, + syevj_params, + batch_size); + } else if (use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnCreateSyevjInfo(&syevj_params)); - PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( dev_ctx.cusolver_dn_handle(), jobz, @@ -272,7 +419,21 @@ struct MatrixEighFunctor { for (auto i = 0; i < batch_size; ++i) { auto *input_data = input_vector + i * vector_stride; auto *value_data = out_value + i * values_stride; - if (use_syevj) { + if (use_cusolver_syevj_batched) { + syevjBatched(handle, + jobz, + uplo, + last_dim, + input_data, + lda, + value_data, + work_ptr, + workspace_size, + &info_ptr[i], + syevj_params, + batch_size); + break; + } else if (use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnSsyevj(handle, jobz, @@ -300,7 +461,7 @@ struct MatrixEighFunctor { } CheckEighResult(dev_ctx, batch_size, info_ptr); - if (use_syevj) { + if (use_cusolver_syevj_batched || use_cusolver_syevj) { PADDLE_ENFORCE_GPU_SUCCESS( dynload::cusolverDnDestroySyevjInfo(syevj_params)); } diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu index cfad86506c9099..1c6d1debbabd93 100644 --- a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -13,7 +13,6 @@ // limitations under the License. 
#include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" @@ -24,6 +23,7 @@ #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/norm_utils.cu.h" #include "paddle/phi/kernels/funcs/norm_utils.h" #include "paddle/phi/kernels/funcs/reduce_function.h" @@ -1352,24 +1352,23 @@ void BatchNormDoubleGradKernel( running_mean = mean.get_ptr(); running_variance = variance.get_ptr(); } - paddle::operators::NormDoubleGradFunctor( - ctx, - data_layout, - &x, - &scale, - &y_grad, - &saved_mean, - &saved_variance, - running_mean, - running_variance, - epsilon, - use_global_stats, - x_grad_grad.get_ptr(), - scale_grad_grad.get_ptr(), - bias_grad_grad.get_ptr(), - x_grad, - scale_grad, - y_grad_grad); + phi::funcs::NormDoubleGradFunctor(ctx, + data_layout, + &x, + &scale, + &y_grad, + &saved_mean, + &saved_variance, + running_mean, + running_variance, + epsilon, + use_global_stats, + x_grad_grad.get_ptr(), + scale_grad_grad.get_ptr(), + bias_grad_grad.get_ptr(), + x_grad, + scale_grad, + y_grad_grad); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu index d01397c1fa0665..01e4f08c29bdd5 100644 --- a/paddle/phi/kernels/gpu/batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -21,7 +21,6 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_dnn.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu index c079b61c06e944..5cad80288bf691 100644 --- a/paddle/phi/kernels/gpu/stack_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -18,30 +18,101 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/fast_divmod.h" namespace phi { -template -__global__ void StackCUDAKernel(T** input_ptrs, - IntType split_size, - IntType rows, - IntType cols, +template +struct DivmodWarpper { + public: + void SetDivden(IndexT dividen) { divmoder = phi::funcs::FastDivMod(dividen); } + __device__ inline phi::funcs::FastDivMod::DivModT div_mod(IndexT val) { + return divmoder.Divmod(val); + } + + private: + phi::funcs::FastDivMod divmoder; +}; + +template <> +struct DivmodWarpper { + public: + using DivModT = phi::AlignedVector; + + void SetDivden(int64_t dividen) { dividen_ = dividen; } + __device__ inline DivModT div_mod(int64_t val) { + DivModT data; + data[0] = val / dividen_; + data[1] = val - data[0] * dividen_; + return data; + } + + private: + int64_t dividen_; +}; + +constexpr int kWarpperSize = 64; +template +struct PointerArray : public DivmodWarpper { + public: + const T* data[kWarpperSize]; + PointerArray(const std::vector& x, + int num, + int64_t dividen) { + this->SetDivden(dividen); + for (auto i = 0; i < num; ++i) { + data[i] = x[i]->data(); + } + } +}; + +template +struct PointerToPointer : public DivmodWarpper { + public: + T** data; + PointerToPointer(const Context& ctx, + const std::vector& x, + int num, + int64_t dividen) { + 
this->SetDivden(dividen); + auto byte_len = num * sizeof(T*); + std::vector x_datas(num); + for (int i = 0; i < num; ++i) { + x_datas[i] = x[i]->data(); + } + auto tmp_x_data = paddle::memory::Alloc( + ctx.GetPlace(), + byte_len, + phi::Stream(reinterpret_cast(ctx.stream()))); + paddle::memory::Copy(ctx.GetPlace(), + tmp_x_data->ptr(), + phi::CPUPlace(), + reinterpret_cast(x_datas.data()), + x_datas.size() * sizeof(T*), + ctx.stream()); + data = reinterpret_cast(tmp_x_data->ptr()); + } +}; + +template +__global__ void StackCUDAKernel(WarpT input_warpper, + IndexT split_size, + IndexT rows, + IndexT cols, T* __restrict__ output) { - IntType grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - IntType grid_x_stride = static_cast(blockDim.x) * gridDim.x; - IntType grid_y_stride = static_cast(blockDim.y) * gridDim.y; + IndexT grid_x = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + IndexT grid_x_stride = static_cast(blockDim.x) * gridDim.x; + IndexT grid_y_stride = static_cast(blockDim.y) * gridDim.y; for (; grid_x < cols; grid_x += grid_x_stride) { - IntType grid_y = - static_cast(blockIdx.y) * blockDim.y + threadIdx.y; + IndexT grid_y = static_cast(blockIdx.y) * blockDim.y + threadIdx.y; - IntType split = grid_x / split_size; - const T* input_ptr = input_ptrs[split]; - IntType col_offset = grid_x % split_size; + auto divmod_rslt = input_warpper.div_mod(grid_x); + const T* input_ptr = input_warpper.data[divmod_rslt[0]]; #pragma unroll for (; grid_y < rows; grid_y += grid_y_stride) { output[grid_y * cols + grid_x] = - input_ptr[grid_y * split_size + col_offset]; + input_ptr[grid_y * split_size + divmod_rslt[1]]; } } } @@ -52,24 +123,8 @@ void StackKernel(const Context& dev_ctx, int axis, DenseTensor* out) { if (axis < 0) axis += (x[0]->dims().size() + 1); - int n = static_cast(x.size()); T* y_data = dev_ctx.template Alloc(out); - std::vector x_datas(n); - for (int i = 0; i < n; i++) { - x_datas[i] = x[i]->data(); - } - - auto tmp_x_data = paddle::memory::Alloc( - dev_ctx.GetPlace(), - x_datas.size() * sizeof(T*), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - paddle::memory::Copy(dev_ctx.GetPlace(), - tmp_x_data->ptr(), - phi::CPUPlace(), - reinterpret_cast(x_datas.data()), - x_datas.size() * sizeof(T*), - dev_ctx.stream()); // Split x dim from axis to matrix int64_t x_row = 1, x_col = 1; @@ -78,33 +133,40 @@ void StackKernel(const Context& dev_ctx, } x_col = x[0]->numel() / x_row; int64_t out_col = x_col * n; - auto config = phi::backends::gpu::GetGpuLaunchConfig2D(dev_ctx, out_col, x_row); - if (out->numel() < std::numeric_limits::max()) { - StackCUDAKernel - <<>>(reinterpret_cast(tmp_x_data->ptr()), - static_cast(x_col), - static_cast(x_row), - static_cast(out_col), - y_data); +#define IMPL_STACK_CUDA_KERNEL(index_t, input_warpper) \ + StackCUDAKernel \ + <<>>(input_warpper, \ + static_cast(x_col), \ + static_cast(x_row), \ + static_cast(out_col), \ + y_data); + + bool use_int32 = out->numel() < std::numeric_limits::max(); + if (n <= kWarpperSize) { + if (use_int32) { + PointerArray ptr_array(x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array); + } else { + PointerArray ptr_array(x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int64_t, ptr_array); + } } else { - StackCUDAKernel - <<>>(reinterpret_cast(tmp_x_data->ptr()), - x_col, - x_row, - out_col, - y_data); + if (use_int32) { + PointerToPointer ptr_array(dev_ctx, x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int32_t, ptr_array); + } else { + PointerToPointer ptr_array(dev_ctx, x, n, x_col); + IMPL_STACK_CUDA_KERNEL(int64_t, 
ptr_array); + } } +#undef IMPL_STACK_CUDA_KERNEL } - } // namespace phi PD_REGISTER_KERNEL(stack, diff --git a/paddle/phi/kernels/onednn/reshape_kernel.cc b/paddle/phi/kernels/onednn/reshape_kernel.cc new file mode 100644 index 00000000000000..4d8adc4b9a6e1d --- /dev/null +++ b/paddle/phi/kernels/onednn/reshape_kernel.cc @@ -0,0 +1,179 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +static DDim ValidateShape(const std::vector& shape, + const DDim& in_dims) { + const int64_t in_size = product(in_dims); + auto in_dims_vec = vectorize(in_dims); + bool all_positive = std::all_of(in_dims_vec.cbegin(), + in_dims_vec.cend(), + [](int64_t i) { return i > 0; }); + // only one dimension can be set to -1, whose size will be automatically + // infered + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE_EQ( + unk_dim_idx, + -1, + errors::InvalidArgument( + "Only one dimension value of 'shape' in ReshapeOp can " + "be -1. But received shape = [%s], shape[%d] is also -1.", + make_ddim(shape), + i)); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE_LT( + static_cast(i), + in_dims.size(), + errors::InvalidArgument( + "The index of 0 in `shape` must be less than " + "the input tensor X's dimensions. " + "But received shape = [%s], shape[%d] = 0, X's shape = [%s], " + "X's dimensions = %d.", + make_ddim(shape), + i, + in_dims, + in_dims.size())); + } else { + PADDLE_ENFORCE_GT( + shape[i], + 0, + errors::InvalidArgument( + "Each dimension value of 'shape' in ReshapeOp must not " + "be negative except one unknown dimension. " + "But received shape = [%s], shape[%d] = %d.", + make_ddim(shape), + i, + shape[i])); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + if (all_positive) { + // in_size < 0 and is un-determinate in compile time, skip the check, + // for example, in_dims = [-1, 8, 1, 1], shape = [-1, 3, 8], + // capacity = -24, in_size = -8, output_shape[0] = 0 + // the following check will fail. + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ( + output_shape[unk_dim_idx] * capacity, + -in_size, + errors::InvalidArgument( + "The 'shape' attribute in ReshapeOp is invalid. " + "The input tensor X'size must be divisible by known " + "capacity of 'shape'. 
" + "But received X's shape = [%s], X's size = %d, " + "'shape' is [%s], known capacity of 'shape' is %d.", + in_dims, + in_size, + make_ddim(shape), + capacity)); + } else { + output_shape[unk_dim_idx] = -1; + } + } else { + if (all_positive) { + PADDLE_ENFORCE_EQ( + capacity, + in_size, + errors::InvalidArgument( + "The 'shape' in ReshapeOp is invalid. " + "The input tensor X'size must be equal to the capacity of " + "'shape'. " + "But received X's shape = [%s], X's size = %d, 'shape' is " + "[%s], the capacity of 'shape' is %d.", + in_dims, + in_size, + make_ddim(shape), + capacity)); + } + } + return make_ddim(output_shape); +} + +template +void ExecuteReshape(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + const DDim& x_dims, + DenseTensor* out) { + auto out_dims = ValidateShape(shape.GetData(), x_dims); + auto x_vec_dims = vectorize(x_dims); + + funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, + x.dtype(), + funcs::ToOneDNNDataType(x.dtype()), + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), funcs::to_void_cast(x.data())); + out->Resize(x_dims); // to match x numel, format is changed later + // reorder is done into a plain tag to allow usage with blocked formats + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, funcs::GetPlainOneDNNFormat(x_dims.size()), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + + astream.wait(); + + out->Resize(out_dims); + out->set_mem_desc( + reorder_dst_memory_p->get_desc().reshape(vectorize(out_dims))); +} + +template +void ReshapeKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + auto x_dims = x.dims(); + ExecuteReshape(dev_ctx, x, shape, x_dims, out); +} + +template +void ReshapeWithXShape(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out, + DenseTensor* xshape) { + auto x_dims = slice_ddim(xshape->dims(), 1, xshape->dims().size()); + ExecuteReshape(dev_ctx, x, shape, x_dims, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + reshape, OneDNN, ONEDNN, phi::ReshapeKernel, float, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(reshape_with_xshape, + OneDNN, + ONEDNN, + phi::ReshapeWithXShape, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc index 64f1f9f610861b..dafbb75dc07ac5 100644 --- a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc @@ -13,8 +13,6 @@ // limitations under the License. 
#include "paddle/phi/kernels/transpose_grad_kernel.h" - -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" @@ -24,16 +22,16 @@ void TransposeGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, const std::vector& axis, DenseTensor* x_grad) { - PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == AllocationType::CPU, true, errors::PreconditionNotMet( - "Operator DNNL TransposeGrad must use CPUPlace")); + "oneDNN TransposeGrad kernel must use CPUPlace")); if (!x_grad) return; const auto& onednn_engine = dev_ctx.GetEngine(); if (axis.size() == 1) { - paddle::framework::TensorCopy(out_grad, out_grad.place(), x_grad); + Copy(dev_ctx, out_grad, out_grad.place(), false, x_grad); x_grad->set_mem_desc(out_grad.mem_desc()); return; } diff --git a/paddle/phi/kernels/onednn/transpose_kernel.cc b/paddle/phi/kernels/onednn/transpose_kernel.cc index 26c89197e0d7f4..a36d5e4493a549 100644 --- a/paddle/phi/kernels/onednn/transpose_kernel.cc +++ b/paddle/phi/kernels/onednn/transpose_kernel.cc @@ -13,7 +13,6 @@ // limitations under the License. #include "paddle/phi/kernels/transpose_kernel.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/phi/core/kernel_registry.h" @@ -80,7 +79,7 @@ void TransposeKernel(const Context& dev_ctx, dev_ctx, const_cast(&x), x.mem_desc()); if (axis.size() == 1) { - paddle::framework::TensorCopy(x, x.place(), out); + Copy(dev_ctx, x, x.place(), false, out); out->set_mem_desc(x.mem_desc()); return; } diff --git a/paddle/phi/ops/compat/log_loss_sig.cc b/paddle/phi/ops/compat/log_loss_sig.cc deleted file mode 100644 index adf40bac000e3f..00000000000000 --- a/paddle/phi/ops/compat/log_loss_sig.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature LogLossGradOpArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("log_loss_grad", - {"Predicted", "Labels", "Loss@GRAD"}, - {"epsilon"}, - {"Predicted@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(log_loss_grad, phi::LogLossGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/put_along_axis_sig.cc b/paddle/phi/ops/compat/put_along_axis_sig.cc deleted file mode 100644 index 83f0e5f65a0c51..00000000000000 --- a/paddle/phi/ops/compat/put_along_axis_sig.cc +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
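The ValidateShape helper in the new oneDNN reshape kernel above follows the usual reshape semantics: a 0 entry copies the input dimension at the same index, at most one -1 entry is inferred from the remaining capacity, and every other entry must be positive. A minimal Python sketch, assuming every input dimension is known and positive (the kernel additionally handles the compile-time-unknown case and rejects other negative entries):

def infer_reshape_shape(shape, in_dims):
    in_size = 1
    for d in in_dims:
        in_size *= d
    # A 0 entry copies the input dimension at the same index.
    out = [in_dims[i] if s == 0 else s for i, s in enumerate(shape)]
    if out.count(-1) > 1:
        raise ValueError("only one entry of 'shape' may be -1")
    known = 1
    for d in out:
        if d != -1:
            known *= d
    if -1 in out:
        if in_size % known != 0:
            raise ValueError("input size must be divisible by the known capacity")
        out[out.index(-1)] = in_size // known
    elif known != in_size:
        raise ValueError("'shape' capacity must equal the input size")
    return out

print(infer_reshape_shape([0, -1, 4], [2, 3, 8]))  # [2, 6, 4]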
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature PutAlongAxisArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("put_along_axis", - {"Input", "Index", "Value"}, - {"Axis", "Reduce"}, - {"Result"}); -} - -KernelSignature PutAlongAxisGradArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("put_along_axis_grad", - {"Input", "Index", "Result@GRAD"}, - {"Axis", "Reduce"}, - {"Input@GRAD", "Value@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(put_along_axis, phi::PutAlongAxisArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(put_along_axis_grad, - phi::PutAlongAxisGradArgumentMapping); diff --git a/paddle/phi/ops/compat/svd_sig.cc b/paddle/phi/ops/compat/svd_sig.cc deleted file mode 100644 index 2b97d23f8b85f8..00000000000000 --- a/paddle/phi/ops/compat/svd_sig.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature SvdGradOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("svd_grad", - {"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"}, - {"full_matrices"}, - {"X@GRAD"}); -} -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(svd_grad, phi::SvdGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/take_along_axis_sig.cc b/paddle/phi/ops/compat/take_along_axis_sig.cc deleted file mode 100644 index a35c1c2db44800..00000000000000 --- a/paddle/phi/ops/compat/take_along_axis_sig.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/phi/core/compat/op_utils.h" - -namespace phi { - -KernelSignature TakeAlongAxisArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature( - "take_along_axis", {"Input", "Index"}, {"Axis"}, {"Result"}); -} - -KernelSignature TakeAlongAxisGradArgumentMapping( - const ArgumentMappingContext& ctx) { - return KernelSignature("take_along_axis_grad", - {"Input", "Index", "Result@GRAD"}, - {"Axis"}, - {"Input@GRAD"}); -} - -} // namespace phi - -PD_REGISTER_ARG_MAPPING_FN(take_along_axis, phi::TakeAlongAxisArgumentMapping); -PD_REGISTER_ARG_MAPPING_FN(take_along_axis_grad, - phi::TakeAlongAxisGradArgumentMapping); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 47e64afb16c57e..23895ef14c90f3 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -769,8 +769,8 @@ function run_linux_cpu_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install hypothesis - if [ -d "${PADDLE_ROOT}/build/python/dist/" ]; then - pip install ${PADDLE_ROOT}/build/python/dist/*whl + if [ -d "${PADDLE_ROOT}/dist/" ]; then + pip install ${PADDLE_ROOT}/dist/*whl fi cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/testsuite.py ${PADDLE_ROOT}/build/python @@ -3463,6 +3463,12 @@ function check_coverage_build() { } function run_setup(){ rm -rf ${PADDLE_ROOT}/build + # Build script will not fail if *.deb does not exist + rm *.deb 2>/dev/null || true + # Delete previous built egg packages + rm -rf ${PADDLE_ROOT}/dist 2>/dev/null || true + # Delete previous built paddle cache + rm -rf ${PADDLE_ROOT}/build/python/paddle 2>/dev/null || true startTime_s=`date +%s` SYSTEM=`uname -s` @@ -3477,7 +3483,6 @@ function run_setup(){ export PYTHON_EXECUTABLE=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 export PYTHON_INCLUDE_DIR=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ export PYTHON_LIBRARY=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib - pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt else exit 1 @@ -3525,15 +3530,7 @@ function run_setup(){ else if [ "$1" != "" ]; then echo "using python abi: $1" - if [ "$1" == "cp36-cp36m" ]; then - export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} - export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} - #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export - export PYTHON_EXECUTABLE=/opt/_internal/cpython-3.6.0/bin/python3 - export PYTHON_INCLUDE_DIR=/opt/_internal/cpython-3.6.0/include/python3.6m - export PYTHON_LIBRARIES=/opt/_internal/cpython-3.6.0/lib/libpython3.so - pip3.6 install -r ${PADDLE_ROOT}/python/requirements.txt - elif [ "$1" == "cp37-cp37m" ]; then + if [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} #after changing "PYTHON_LIBRARY:FILEPATH" to "PYTHON_LIBRARY" ,we can use export @@ -3651,7 +3648,7 @@ function run_setup(){ # reset ccache zero stats for collect PR's actual hit rate ccache -z - python setup.py install;build_error=$? + python setup.py $2;build_error=$? 
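+    # The second argument selects the setup.py command; the run_setup call
+    # sites in main() below pass "bdist_wheel" (cicheck_py37) or "install"
+    # (build_gpubox).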
# ci will collect ccache hit rate collect_ccache_hits @@ -3871,7 +3868,7 @@ function main() { build_mac ;; cicheck_py37) - cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} + run_setup ${PYTHON_ABI:-""} bdist_wheel run_linux_cpu_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; test_cicheck_py37) @@ -3884,7 +3881,7 @@ function main() { parallel_test ;; build_gpubox) - run_setup ${PYTHON_ABI:-""} + run_setup ${PYTHON_ABI:-""} install ;; check_xpu) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index c12381c894e794..d845f3b78c6345 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -220,7 +220,8 @@ def unscale_method(self, optimizer): temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool_)) temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool_)) - device = "cpu" if optimizer.offload else "gpu" + device = paddle.get_device().split(":")[0] + device = "cpu" if optimizer.offload else device dev_id = ( 0 if device == "cpu" else int(paddle.get_device().split(":")[1]) ) @@ -245,8 +246,9 @@ def unscale_method(self, optimizer): is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") paddle.distributed.all_reduce( - is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None + is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None ) + self._found_inf = is_found_inf.numpy()[0] scaler._unscale = MethodType(unscale_method, scaler) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 721d10d76b1286..1f644147a209bd 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -252,14 +252,26 @@ def check_models(models): ) +def _is_valid_optimizer(optimizer): + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, + ) + + return isinstance( + optimizer, + ( + paddle.optimizer.Optimizer, + paddle.fluid.optimizer.Optimizer, + DygraphShardingOptimizer, + ), + ) + + def check_optimizers(optimizers): for optimizer in optimizers: - if not isinstance( - optimizer, - (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), - ): + if not _is_valid_optimizer(optimizer): raise RuntimeError( - "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer, but receive {}.".format( + "Current train mode is pure fp16, optimizers should be paddle.optimizer.Optimizer or paddle.fluid.optimizer.Optimizer or DygraphShardingOptimizer, but receive {}.".format( type(optimizer) ) ) @@ -477,6 +489,20 @@ def __call__(self, state_dict): state_dict[key] = param_applied +def _set_multi_precision(optimizer, multi_precision): + from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.dygraph_sharding_optimizer import ( + DygraphShardingOptimizer, + ) + + optimizer = ( + optimizer._inner_optimizer + if isinstance(optimizer, DygraphShardingOptimizer) + else optimizer + ) + if hasattr(optimizer, "_multi_precision"): + optimizer._multi_precision = multi_precision + + @dygraph_only def amp_decorate( models, @@ -582,10 +608,7 @@ def amp_decorate( if optimizers is not None: # check optimizers optimizers_is_list = False - if isinstance( - optimizers, - 
(paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), - ): + if _is_valid_optimizer(optimizers): optimizers_is_list = False optimizers = [optimizers] check_optimizers(optimizers) @@ -596,13 +619,10 @@ def amp_decorate( raise TypeError( "optimizers must be either a single optimizer or a list of optimizers." ) - # supprot master_weight - for idx_opt in range(len(optimizers)): - if hasattr(optimizers[idx_opt], '_multi_precision'): - if master_weight is False: - optimizers[idx_opt]._multi_precision = False - else: - optimizers[idx_opt]._multi_precision = True + # support master_weight + use_multi_precision = not (master_weight is False) + for opt in optimizers: + _set_multi_precision(opt, use_multi_precision) if save_dtype is not None: if not (save_dtype in ['float16', 'bfloat16', 'float32', 'float64']): diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 9515a8bd1704e5..d3e91295d43a29 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -83,9 +83,10 @@ def save_dygraph(state_dict, model_path): .. code-block:: python import paddle.fluid as fluid + import paddle with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph( state_dict, "paddle_dy") diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py index 77d4f2c2573f94..cf794ad4cef899 100644 --- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py +++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py @@ -170,10 +170,11 @@ class PiecewiseDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle boundaries = [10000, 20000] values = [1.0, 0.5, 0.1] with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( [10, 10] ) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0), parameter_list = emb.parameters() ) @@ -240,9 +241,10 @@ class NaturalExpDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle base_lr = 0.1 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.NaturalExpDecay( learning_rate=base_lr, @@ -403,9 +405,10 @@ class InverseTimeDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle base_lr = 0.1 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) sgd_optimizer = fluid.optimizer.SGD( learning_rate=fluid.dygraph.InverseTimeDecay( learning_rate=base_lr, @@ -487,11 +490,12 @@ class PolynomialDecay(LearningRateDecay): .. code-block:: python import paddle.fluid as fluid + import paddle start_lr = 0.01 total_step = 5000 end_lr = 0 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( [10, 10]) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.PolynomialDecay( start_lr, total_step, end_lr, power=1.0), @@ -639,10 +643,11 @@ class NoamDecay(LearningRateDecay): .. 
code-block:: python import paddle.fluid as fluid + import paddle warmup_steps = 100 learning_rate = 0.01 with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) optimizer = fluid.optimizer.SGD( learning_rate = fluid.dygraph.NoamDecay( 1/(warmup_steps *(learning_rate ** 2)), diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index f0b761fff82905..77436e9293d644 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -51,7 +51,6 @@ __all__ = [ 'BatchNorm', - 'Embedding', ] @@ -360,187 +359,6 @@ def forward(self, input): return self._helper.append_activation(batch_norm_out, self._act) -class Embedding(layers.Layer): - r""" - :alias_main: paddle.nn.Embedding - :alias: paddle.nn.Embedding,paddle.nn.layer.Embedding,paddle.nn.layer.common.Embedding - :old_api: paddle.fluid.dygraph.Embedding - - **Embedding Layer** - - This interface is used to construct a callable object of the ``Embedding`` class. - For specific usage, refer to code examples. It implements the function of the Embedding Layer. - This layer is used to lookup embeddings vector of ids provided by :attr:`input` . - It automatically constructs a 2D embedding matrix based on the - input :attr:`size` (vocab_size, emb_size) and :attr:`dtype` . - - The shape of output Tensor is generated by appending an emb_size dimension to the - last dimension of the input Tensor shape. - - **Note:** The id in :attr:`input` must satisfy :math:`0 =< id < size[0]` , - otherwise the program will throw an exception and exit. - - .. code-block:: text - - Case 1: - - input is a Tensor. padding_idx = -1 - input.data = [[1, 3], [2, 4], [4, 127] - input.shape = [3, 2] - Given size = [128, 16] - output is a Tensor: - out.shape = [3, 2, 16] - out.data = [[[0.129435295, 0.244512452, ..., 0.436322452], - [0.345421456, 0.524563927, ..., 0.144534654]], - - [[0.345249859, 0.124939536, ..., 0.194353745], - [0.945345345, 0.435394634, ..., 0.435345365]], - - [[0.945345345, 0.435394634, ..., 0.435345365], - [0.0, 0.0, ..., 0.0 ]]] # padding data - The input padding_idx is less than 0, it is automatically converted to padding_idx = -1 + 128 = 127 - It will pad all-zero data when ids is 127. - - Parameters: - size(tuple|list): The shape of the look up table parameter. It should have two elements which indicate the size - of the dictionary of embeddings and the size of each embedding vector respectively. - is_sparse(bool): The flag indicating whether to use sparse update. This parameter only - affects the performance of the backwards gradient update. It is recommended to set - True because sparse update is faster. But some optimizer does not support sparse update, - such as :ref:`api_fluid_optimizer_AdadeltaOptimizer` , :ref:`api_fluid_optimizer_AdamaxOptimizer` , - :ref:`api_fluid_optimizer_DecayedAdagradOptimizer` , :ref:`api_fluid_optimizer_FtrlOptimizer` , - :ref:`api_fluid_optimizer_LambOptimizer` and :ref:`api_fluid_optimizer_LarsMomentumOptimizer` . - In these case, is_sparse must be False. Default: False. - is_distributed(bool): Whether to store the embedding matrix in a distributed manner. Only used - in multi-machine distributed CPU training. Default: False. - padding_idx(int|long|None): padding_idx needs to be in the interval [-vocab_size, vocab_size). - If :math:`padding\_idx < 0`, the :math:`padding\_idx` will automatically be converted - to :math:`vocab\_size + padding\_idx` . 
It will output all-zero padding data whenever lookup - encounters :math:`padding\_idx` in id. And the padding data will not be updated while training. - If set None, it makes no effect to output. Default: None. - param_attr(ParamAttr): To specify the weight parameter property. Default: None, which means the - default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . In addition, - user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. - The local word vector needs to be transformed into numpy format, and the shape of local word - vector should be consistent with :attr:`size` . Then :ref:`api_fluid_initializer_NumpyArrayInitializer` - is used to load custom or pre-trained word vectors. See code example 2 for details. - dtype(np.dtype|core.VarDesc.VarType|str): It refers to the data type of output Tensor. - It must be "float32" or "float64". Default: "float32". - - Attribute: - **weight** (Parameter): the learnable weights of this layer. - - Returns: - Variable: Embedding Tensor or LoDTensor mapped by input. The data type is the same as :attr:`dtype` . - - Examples: - - .. code-block:: python - - import paddle.fluid as fluid - import paddle.fluid.dygraph.base as base - import numpy as np - - # example 1 - inp_word = np.array([[2, 3, 5], [4, 2, 1]]).astype('int64') - inp_word.shape # [2, 3] - dict_size = 20 - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( - size=[dict_size, 32], - param_attr='emb.w', - is_sparse=False) - static_rlt3 = emb(base.to_variable(inp_word)) - static_rlt3.shape # [2, 3, 32] - - # example 2: load custom or pre-trained word vectors - weight_data = np.random.random(size=(128, 100)) # word vectors with numpy format - w_param_attrs = fluid.ParamAttr( - name="emb_weight", - learning_rate=0.5, - initializer=fluid.initializer.NumpyArrayInitializer(weight_data), - trainable=True) - with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding( - size=[128, 100], - param_attr= w_param_attrs, - is_sparse=False) - static_rlt3 = emb(base.to_variable(inp_word)) - """ - - def __init__( - self, - size, - is_sparse=False, - is_distributed=False, - padding_idx=None, - param_attr=None, - dtype='float32', - ): - super().__init__() - self._size = size - self._is_sparse = is_sparse - self._is_distributed = is_distributed - self._padding_idx = ( - -1 - if padding_idx is None - else padding_idx - if padding_idx >= 0 - else (size[0] + padding_idx) - ) - - self._param_attr = param_attr - self._dtype = dtype - self._remote_prefetch = self._is_sparse and (not self._is_distributed) - if self._remote_prefetch: - assert self._is_sparse is True and self._is_distributed is False - - self.weight = self.create_parameter( - attr=self._param_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False, - ) - - def forward(self, input): - if _non_static_mode(): - return _legacy_C_ops.lookup_table_v2( - self.weight, - input, - 'is_sparse', - self._is_sparse, - 'is_distributed', - self._is_distributed, - 'remote_prefetch', - self._remote_prefetch, - 'padding_idx', - self._padding_idx, - ) - - check_variable_and_dtype( - input, - 'input', - ['uint8', 'int8', 'int16', 'int32', 'int64'], - 'Embedding', - ) - attrs = { - 'is_sparse': self._is_sparse, - 'is_distributed': self._is_distributed, - 'remote_prefetch': self._remote_prefetch, - 'padding_idx': self._padding_idx, - } - - out = self._helper.create_variable_for_type_inference(self._dtype) - self._helper.append_op( - type='lookup_table_v2', - inputs={'Ids': input, 
'W': self.weight}, - outputs={'Out': out}, - attrs=attrs, - ) - - return out - - class RowConv(layers.Layer): """ ***Row-convolution operator*** diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index cb030f71a45bc5..3432baf442e2ad 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -723,10 +723,6 @@ def init_reducer(self): def check_layer_sparse(sublayer): if isinstance(sublayer, paddle.nn.layer.common.Embedding): return sublayer._sparse - # NOTE(shenliang03):This is for compatibility. If paddle.fluid.dygraph.Embedding - # is removed in the future, the check will also be removed here. - if isinstance(sublayer, paddle.fluid.dygraph.Embedding): - return sublayer._is_sparse return False is_sparse_gradient = [ @@ -875,8 +871,8 @@ def state_dict( dist.init_parallel_env() - emb = fluid.dygraph.Embedding([10, 10]) - emb = fluid.dygraph.DataParallel(emb) + emb = paddle.nn.Embedding(10, 10) + emb = paddle.fluid.dygraph.DataParallel(emb) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") @@ -910,7 +906,7 @@ def set_state_dict(self, state_dict, use_structured_name=True): dist.init_parallel_env() emb = paddle.nn.Embedding(10, 10) - emb = fluid.dygraph.DataParallel(emb) + emb = paddle.fluid.dygraph.DataParallel(emb) state_dict = emb.state_dict() paddle.save(state_dict, "paddle_dy.pdparams") diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index da4f609c401ac3..7393c6104f38ed 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1660,10 +1660,11 @@ def gradient(self): # example2: return tuple of ndarray with fluid.dygraph.guard(): - embedding = fluid.dygraph.Embedding( - size=[20, 32], - param_attr='emb.w', - is_sparse=True) + embedding = paddle.nn.Embedding( + 20, + 32, + weight_attr='emb.w', + sparse=True) x_data = np.arange(12).reshape(4, 3).astype('int64') x_data = x_data.reshape((-1, 3, 1)) x = fluid.dygraph.base.to_variable(x_data) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 71167f30e026a6..42c57193941f30 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -214,9 +214,10 @@ def state_dict(self): .. 
code-block:: python import paddle.fluid as fluid + import paddle with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = fluid.optimizer.Adam(0.001, parameter_list=emb.parameters()) state_dict = adam.state_dict() @@ -582,7 +583,7 @@ def current_step_lr(self): # example1: LearningRateDecay is not used, return value is all the same with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) adam = fluid.optimizer.Adam(0.001, parameter_list = emb.parameters()) lr = adam.current_step_lr() print(lr) # 0.001 diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py index f63cfc089ed8a5..ef85aab80f6c95 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_save_for_auto_infer.py @@ -39,7 +39,6 @@ from paddle.distributed.sharding.group_sharded import group_sharded_parallel from paddle.distributed.utils.log_utils import get_logger from paddle.fluid.dataloader.dataset import IterableDataset -from paddle.fluid.dygraph.nn import Embedding from paddle.incubate.distributed.utils.io import save_for_auto_inference from paddle.nn import Linear @@ -131,7 +130,7 @@ def __init__( bias_attr=None, ): super(MLP, self).__init__() - self.embedding = Embedding((embedding_size, linear_size)) + self.embedding = paddle.nn.Embedding(embedding_size, linear_size) self._linear1 = Linear(linear_size, linear_size) self._linear2 = Linear(linear_size, linear_size) self._linear3 = Linear(linear_size, 10) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py index 48ec09552d7f98..deaf9779d44f60 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_control_flow_different.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.dygraph.nn import Embedding paddle.seed(123) np.random.seed(2021) @@ -29,10 +28,10 @@ def __init__(self, hidden_size, vocab_size, is_sparse=False): super().__init__() self.hidden_size = hidden_size self.vocab_size = vocab_size - self.embedding = Embedding( - size=[self.vocab_size, self.hidden_size], - dtype='float32', - is_sparse=is_sparse, + self.embedding = paddle.nn.Embedding( + self.vocab_size, + self.hidden_size, + sparse=is_sparse, ) self.lin_a = paddle.nn.Linear(self.hidden_size, self.vocab_size) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 8d49434ac54e8b..a8ddeb0bfdbede 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid.dygraph import Embedding, Layer, to_variable +from paddle.fluid.dygraph import Layer, to_variable from paddle.optimizer.lr import NoamDecay """ @@ -513,11 +513,11 @@ def __init__( self._src_emb_dim = src_emb_dim 
self._src_vocab_size = src_vocab_size self._dropout_rate = dropout_rate - self._input_emb = Embedding( - size=[src_vocab_size, src_emb_dim], - is_sparse=is_sparse, - padding_idx=0, - param_attr=fluid.ParamAttr( + self._input_emb = paddle.nn.Embedding( + src_vocab_size, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), ), @@ -527,10 +527,11 @@ def __init__( pos_inp = pos_inp1 else: pos_inp = pos_inp2 - self._pos_emb = Embedding( - size=[self._src_max_len, src_emb_dim], - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._src_max_len, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=pos_enc_param_name, initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py index d30466d9fc957b..2af1f4adec9bb8 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision.py @@ -344,7 +344,7 @@ def nan_inf(self): scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) - self.assertEqual(scaler._found_inf.numpy() == 1, True) + self.assertEqual(scaler._found_inf.numpy() >= 1, True) for param in model.parameters(): # param not update when tensor contains nan or inf diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py index f688d28b856031..b5d36dfebaad4a 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_imperative_auto_mixed_precision_for_eager.py @@ -343,7 +343,7 @@ def nan_inf(self): scaled_loss = scaler.scale(loss) scaled_loss.backward() optimize_ops, params_grads = scaler.minimize(optimizer, scaled_loss) - self.assertEqual(scaler._found_inf.numpy() == 1, True) + self.assertEqual(scaler._found_inf.numpy() >= 1, True) for param in model.parameters(): # param not update when tensor contains nan or inf diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py index 43f7f0f6d2b5e1..a6e4f09564dfa0 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_dygraph_model.py @@ -16,7 +16,7 @@ import paddle import paddle.fluid as fluid -from paddle.fluid.dygraph import Embedding, Layer +from paddle.fluid.dygraph import Layer from paddle.jit.api import declarative from paddle.nn import Linear @@ -208,29 +208,29 @@ def __init__(self, config, return_pooled_out=True, use_fp16=False): self._param_initializer = fluid.initializer.TruncatedNormal( scale=config['initializer_range'] ) - - self._src_emb = Embedding( - size=[self._voc_size, self._emb_size], - param_attr=fluid.ParamAttr( + paddle.set_default_dtype(self._dtype) + self._src_emb = paddle.nn.Embedding( + self._voc_size, + self._emb_size, + weight_attr=fluid.ParamAttr( 
name=self._word_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) - self._pos_emb = Embedding( - size=[self._max_position_seq_len, self._emb_size], - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._max_position_seq_len, + self._emb_size, + weight_attr=fluid.ParamAttr( name=self._pos_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) - self._sent_emb = Embedding( - size=[self._sent_types, self._emb_size], - param_attr=fluid.ParamAttr( + self._sent_emb = paddle.nn.Embedding( + self._sent_types, + self._emb_size, + weight_attr=fluid.ParamAttr( name=self._sent_emb_name, initializer=self._param_initializer ), - dtype=self._dtype, ) self.pooled_fc = Linear( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index bbca449bde67a3..eceba1198fa474 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -21,8 +21,8 @@ from paddle.fluid import ParamAttr, layers from paddle.fluid.dygraph import Layer from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.jit.api import declarative +from paddle.nn import Embedding INF = 1.0 * 1e5 alpha = 0.6 @@ -122,16 +122,18 @@ def __init__( forget_bias = 1.0 self.src_embeder = Embedding( - size=[self.src_vocab_size, self.hidden_size], - param_attr=fluid.ParamAttr( + self.src_vocab_size, + self.hidden_size, + weight_attr=fluid.ParamAttr( initializer=uniform_initializer(init_scale) ), ) self.tar_embeder = Embedding( - size=[self.tar_vocab_size, self.hidden_size], - is_sparse=False, - param_attr=fluid.ParamAttr( + self.tar_vocab_size, + self.hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( initializer=uniform_initializer(init_scale) ), ) @@ -545,17 +547,19 @@ def __init__( forget_bias = 1.0 self.src_embeder = Embedding( - size=[self.src_vocab_size, self.hidden_size], - param_attr=fluid.ParamAttr( + self.src_vocab_size, + self.hidden_size, + weight_attr=fluid.ParamAttr( name='source_embedding', initializer=uniform_initializer(init_scale), ), ) self.tar_embeder = Embedding( - size=[self.tar_vocab_size, self.hidden_size], - is_sparse=False, - param_attr=fluid.ParamAttr( + self.tar_vocab_size, + self.hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='target_embedding', initializer=uniform_initializer(init_scale), ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py index d16f07d9a2e343..b6baf7ddf2913c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model.py @@ -17,7 +17,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.param_attr as attr -from paddle.fluid.dygraph import Embedding, Layer +from paddle.fluid.dygraph import Layer from paddle.jit.api import declarative from paddle.static import Variable @@ -42,11 +42,12 @@ def ops(self): """ # TODO(huihuangzheng): The original code set the is_sparse=True, but it # causes crush in dy2stat. Set it to True after fixing it. 
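The replacement just below, like the other model and test updates in this patch, migrates from the removed fluid.dygraph.Embedding to paddle.nn.Embedding: the size=[vocab, dim] list becomes two positional arguments, is_sparse becomes sparse, param_attr becomes weight_attr, and the dtype argument is dropped (the layer follows the default dtype). A before/after sketch with placeholder names (vocab_size, emb_dim, pad_id, w_attr):

# old API, removed from paddle.fluid.dygraph.nn in this patch
emb = fluid.dygraph.Embedding(size=[vocab_size, emb_dim],
                              is_sparse=True,
                              padding_idx=pad_id,
                              param_attr=w_attr,
                              dtype='float32')
# new API, as used by the updated files
emb = paddle.nn.Embedding(vocab_size, emb_dim,
                          padding_idx=pad_id,
                          sparse=True,
                          weight_attr=w_attr)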
- emb = Embedding( - size=[self.dict_size, self.emb_dim], - is_sparse=True, + emb = paddle.nn.Embedding( + self.dict_size, + self.emb_dim, + sparse=True, padding_idx=self.padding_idx, - param_attr=attr.ParamAttr( + weight_attr=attr.ParamAttr( name=self.name, initializer=fluid.initializer.Xavier() ), ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py index 06f460912b45be..99fe330c692410 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/simnet_dygraph_model_v2.py @@ -38,11 +38,12 @@ def ops(self): """ # TODO(huihuangzheng): The original code set the is_sparse=True, but it # causes crush in dy2stat. Set it to True after fixing it. - emb = paddle.fluid.dygraph.Embedding( - size=[self.dict_size, self.emb_dim], - is_sparse=True, + emb = paddle.nn.Embedding( + self.dict_size, + self.emb_dim, + sparse=True, padding_idx=self.padding_idx, - param_attr=paddle.ParamAttr( + weight_attr=paddle.ParamAttr( name=self.name, initializer=paddle.nn.initializer.XavierUniform(), ), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index 7a5fbbc0842434..1ec320317d4c54 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -25,7 +25,7 @@ import paddle import paddle.fluid as fluid from paddle import _legacy_C_ops -from paddle.fluid.dygraph import Embedding, to_variable +from paddle.fluid.dygraph import to_variable from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX from paddle.fluid.framework import _non_static_mode from paddle.jit import ProgramTranslator @@ -371,10 +371,10 @@ def __init__(self, args, length=None): self.bigru_num = args.bigru_num self.init_bound = 0.1 - self.word_embedding = Embedding( - size=[self.vocab_size, self.word_emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr( + self.word_embedding = paddle.nn.Embedding( + self.vocab_size, + self.word_emb_dim, + weight_attr=fluid.ParamAttr( learning_rate=self.emb_lr, name="word_emb", initializer=fluid.initializer.Uniform( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 46970eaa27bb6c..49e7c32d6e3186 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -21,7 +21,6 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.optimizer import SGDOptimizer from paddle.jit import ProgramTranslator from paddle.jit.api import declarative @@ -156,11 +155,11 @@ def __init__( init_scale=init_scale, dropout=dropout, ) - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py index f589c37c2fbfd1..60712aeda7aac9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py @@ -150,11 +150,11 @@ def __init__( init_scale=init_scale, dropout=dropout, ) - self.embedding = paddle.fluid.dygraph.nn.Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=paddle.ParamAttr( + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=False, + weight_attr=paddle.ParamAttr( name='embedding_para', initializer=paddle.nn.initializer.Uniform( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 12e5099f257fe7..d9eb993f720707 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -20,10 +20,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.jit import ProgramTranslator from paddle.jit.api import declarative -from paddle.nn import Linear +from paddle.nn import Embedding, Linear SEED = 2020 program_translator = ProgramTranslator() @@ -73,9 +72,9 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + sparse=False, ) self._simple_conv_pool_1 = SimpleConvPool( self.channels, @@ -124,9 +123,9 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + sparse=False, ) self._fc1 = Linear(self.hid_dim, self.hid_dim) self._fc2 = Linear(self.hid_dim, self.fc_hid_dim) @@ -167,10 +166,10 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + weight_attr=fluid.ParamAttr(learning_rate=30), + sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") h_0 = to_variable(h_0) @@ -213,10 +212,10 @@ def __init__(self, dict_dim, batch_size, seq_len): self.batch_size = batch_size self.seq_len = seq_len self.embedding = Embedding( - size=[self.dict_dim + 1, self.emb_dim], - dtype='float32', - param_attr=fluid.ParamAttr(learning_rate=30), - is_sparse=False, + self.dict_dim + 1, + self.emb_dim, + weight_attr=fluid.ParamAttr(learning_rate=30), + sparse=False, ) h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32") h_0 = to_variable(h_0) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py index 13f1a9b882fed3..e546e26a2304f3 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_word2vec.py @@ -20,9 +20,9 @@ import paddle import paddle.fluid as fluid -from 
paddle.fluid.dygraph.nn import Embedding from paddle.jit import ProgramTranslator from paddle.jit.api import declarative +from paddle.nn import Embedding def fake_text(): @@ -227,9 +227,9 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): self.embedding_size = embedding_size self.embedding = Embedding( - size=[self.vocab_size, self.embedding_size], - dtype='float32', - param_attr=fluid.ParamAttr( + self.vocab_size, + self.embedding_size, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-0.5 / self.embedding_size, @@ -239,9 +239,9 @@ def __init__(self, name_scope, vocab_size, embedding_size, init_scale=0.1): ) self.embedding_out = Embedding( - size=[self.vocab_size, self.embedding_size], - dtype='float32', - param_attr=fluid.ParamAttr( + self.vocab_size, + self.embedding_size, + weight_attr=fluid.ParamAttr( name='embedding_out_para', initializer=fluid.initializer.UniformInitializer( low=-0.5 / self.embedding_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index 88cc415b4bbab3..f8641dd2ac4f33 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -18,7 +18,7 @@ import paddle.fluid as fluid import paddle.fluid.layers as layers import paddle.nn.functional as F -from paddle.fluid.dygraph import Embedding, Layer, to_variable +from paddle.fluid.dygraph import Layer, to_variable from paddle.fluid.layers.utils import map_structure from paddle.jit.api import dygraph_to_static_func from paddle.nn import Linear @@ -276,10 +276,10 @@ def forward(self, enc_input, attn_bias): class Embedder(Layer): def __init__(self, vocab_size, emb_dim, bos_idx=0): super().__init__() - self.word_embedder = Embedding( - size=[vocab_size, emb_dim], - padding_idx=bos_idx, - param_attr=fluid.ParamAttr( + self.word_embedder = paddle.nn.Embedding( + vocab_size, + emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.Normal(0.0, emb_dim**-0.5) ), ) @@ -311,9 +311,10 @@ def __init__( self.emb_dropout = prepostprocess_dropout self.emb_dim = d_model self.word_embedder = word_embedder - self.pos_encoder = Embedding( - size=[max_length, self.emb_dim], - param_attr=fluid.ParamAttr( + self.pos_encoder = paddle.nn.Embedding( + max_length, + self.emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( position_encoding_init(max_length, self.emb_dim) ), @@ -499,9 +500,10 @@ def __init__( self.emb_dropout = prepostprocess_dropout self.emb_dim = d_model self.word_embedder = word_embedder - self.pos_encoder = Embedding( - size=[max_length, self.emb_dim], - param_attr=fluid.ParamAttr( + self.pos_encoder = paddle.nn.Embedding( + max_length, + self.emb_dim, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.NumpyArrayInitializer( position_encoding_init(max_length, self.emb_dim) ), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py new file mode 100644 index 00000000000000..f83cffa3400a7d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_int64.py @@ -0,0 +1,239 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +from functools import partial +from typing import Any, Dict, List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtInt64Test1(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + inputs = program_config.inputs + weights = program_config.weights + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + out_shape = list(inputs['input_data'].shape) + for x in range(len(attrs[0]["axes"])): + start = 0 + end = 0 + if attrs[0]["starts"][x] < 0: + start = ( + attrs[0]["starts"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) + else: + start = attrs[0]["starts"][x] + if attrs[0]["ends"][x] < 0: + end = ( + attrs[0]["ends"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) + else: + end = attrs[0]["ends"][x] + start = max(0, start) + end = max(0, end) + out_shape[attrs[0]["axes"][x]] = end - start + if start >= end: + return False + for x in attrs[0]["decrease_axis"]: + if x < 0: + return False + if out_shape[x] != 1: + return False + return True + + def sample_program_configs(self): + def generate_input1(attrs: List[Dict[str, Any]]): + return (10 * np.random.random([6, 6, 64, 64])).astype(np.int64) + + for axes in [[0, 1], [1, 3], [2, 3]]: + for starts in [[0, 1]]: + for ends in [[2, 2], [5, 5], [1, -1]]: + for decrease_axis in [[], [1], [2], [-1], [-100]]: + for infer_flags in [[-1]]: + dics = [ + { + "axes": axes, + "starts": starts, + "ends": ends, + "decrease_axis": decrease_axis, + "infer_flags": infer_flags, + } + ] + + ops_config = [ + { + "op_type": "slice", + "op_inputs": {"Input": ["input_data"]}, + "op_outputs": { + "Out": ["slice_output_data"] + }, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) + }, + outputs=["slice_output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [8, 8, 64, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [6, 6, 64, 64]} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-3 + + def 
test(self): + self.run_test() + + +class TrtInt64Test2(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(shape, op_type): + return np.random.randint( + low=1, high=10000, size=shape, dtype=np.int64 + ) + + for shape in [[2, 32, 16], [1, 8, 16, 32]]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + ]: + for axis in [0, -1]: + self.dims = len(shape) + dics = [{"axis": axis}] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape, op_type) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 3: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4], + "input_data2": [1, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [128, 128, 256], + "input_data2": [128, 128, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 32, 16], + "input_data2": [2, 32, 16], + } + elif self.dims == 4: + self.dynamic_shape.min_input_shape = { + "input_data1": [1, 4, 4, 4], + "input_data2": [1, 4, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data1": [8, 128, 64, 128], + "input_data2": [8, 128, 64, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data1": [2, 64, 32, 32], + "input_data2": [2, 64, 32, 32], + } + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py index e1103c1d595c0c..11aacd02439e99 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding.py @@ -18,7 +18,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding +from paddle.nn import Embedding class SimpleNet(fluid.Layer): @@ -37,10 +37,10 @@ def __init__( self.init_scale = init_scale self.num_steps = num_steps self.embedding = Embedding( - size=[self.vocab_size, self.hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self.vocab_size, + self.hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale ) diff --git 
a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt index ee215ebf27a391..a9832154200435 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt @@ -5,49 +5,13 @@ file( string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") foreach(target ${TEST_INTERP_CASES}) - py_test_modules( - ${target} - MODULES - ${target} - ENVS - FLAGS_host_trace_level=10 - FLAGS_static_executor_perfstat_filepath=./perfstat - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=false - FLAGS_eager_delete_tensor_gb=0) - - py_test_modules( - ${target}_non_eager_deletion - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=false - FLAGS_eager_delete_tensor_gb=0.000001) - - py_test_modules( - ${target}_fast_gc - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=true - FLAGS_eager_delete_tensor_gb=0) - - py_test_modules( - ${target}_fast_gc_non_eager_deletion - MODULES - ${target} - ENVS - FLAGS_allocator_strategy=auto_growth - FLAGS_use_stream_safe_cuda_allocator=true - FLAGS_fast_eager_deletion_mode=true - FLAGS_eager_delete_tensor_gb=0.000001) + py_test_modules(${target} MODULES ${target}) endforeach() +py_test_modules( + test_standalone_executor_no_fast_gc MODULES test_standalone_executor ENVS + FLAGS_fast_eager_deletion_mode=false) + py_test_modules( test_standalone_executor_sequential_run MODULES test_standalone_executor ENVS FLAGS_new_executor_sequential_run=true) @@ -56,5 +20,8 @@ py_test_modules( test_standalone_executor_serial_run MODULES test_standalone_executor ENVS FLAGS_new_executor_serial_run=true) -py_test_modules(test_convert_graph_to_program MODULES test_standalone_executor - ENVS FLAGS_CONVERT_GRAPH_TO_PROGRAM=true) +py_test_modules( + test_standalone_executor_stats MODULES test_standalone_executor ENVS + FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat) + +set_tests_properties(test_standalone_cross_step_overlap PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py new file mode 100644 index 00000000000000..a4fe9f9d258499 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/standalone_executor/test_standalone_cross_step_overlap.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import unittest
+
+import numpy as np
+
+import paddle
+from paddle import static
+
+paddle.enable_static()
+
+
+class TestCrossStepOverlap(unittest.TestCase):
+    def setUp(self):
+        self.shape = [16, 513, 513, 19]
+        self.x_value = 2
+        self.y_value = 3
+        self.overlap_op_num = 1500
+        self.step_num = 3
+
+    def test_cross_step_overlap(self):
+        if not paddle.fluid.core.is_compiled_with_cuda():
+            return
+
+        # In this test case, z=x+y is calculated in the default stream,
+        # and at the same time, numerous reduce_min ops that output to y
+        # are executed in another stream (i.e., the custom stream).
+        # These reduce_min ops are carefully designed so that their kernel
+        # calculation overlaps with the fill_constant kernels (which output
+        # to x and y) in the next step, and therefore cross-step multi-stream
+        # synchronization is required. An Event should be recorded after the
+        # last reduce_min in the first step and waited on before the
+        # fill_constant in the second step. Otherwise, the result of z will
+        # be wrong.
+        program = static.Program()
+        with static.program_guard(program):
+            x = paddle.full(
+                self.shape, fill_value=self.x_value, dtype='float64'
+            )
+            y = paddle.full(
+                self.shape, fill_value=self.y_value, dtype='float64'
+            )
+            z = paddle.add(x, y)
+
+            block = program.global_block()
+            block.var(x.name).desc.set_persistable(True)
+            block.var(y.name).desc.set_persistable(True)
+            for i in range(self.overlap_op_num):
+                block.append_op(
+                    type='reduce_min',
+                    inputs={'X': x.name},
+                    outputs={'Out': y.name},
+                    attrs={'axis': 0, 'keepdim': True},
+                )
+                block.ops[-1].dist_attr.execution_stream = "custom"
+
+        exe = static.Executor()
+        results = []
+        for i in range(self.step_num):
+            result = exe.run(program, fetch_list=[z])
+            results.append(result)
+
+        for result in results:
+            self.assertAlmostEqual(
+                np.sum(result),
+                (self.x_value + self.y_value) * np.prod(self.shape),
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
index 5a301d3f0a5adc..e4dca329fe7c23 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_prune.py
@@ -19,6 +19,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid.framework import _test_eager_guard
+from paddle.nn import Embedding
 from paddle.tensor import random
 
 
@@ -122,8 +123,8 @@ def forward(self, x, label, test_num):
 class MyLayer(fluid.Layer):
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super().__init__(dtype=dtype)
-        self.embed0 = fluid.Embedding(size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(size=(vocab_size, size))
+        self.embed0 = Embedding(vocab_size, size)
+        self.embed1 = Embedding(vocab_size, size)
         self.linear_0 = paddle.nn.Linear(input_size, size)
         self.linear_1 = paddle.nn.Linear(input_size, size)
 
@@ -144,8 +145,8 @@ def embed_linear0(self, x):
 class MyLayer2(fluid.Layer):
     def __init__(self, input_size, vocab_size, size, dtype="float32"):
         super().__init__(dtype=dtype)
-        self.embed0 = fluid.Embedding(size=(vocab_size, size))
-        self.embed1 = fluid.Embedding(size=(vocab_size, size))
+        self.embed0 = Embedding(vocab_size, size)
+        self.embed1 = Embedding(vocab_size, size)
         self.linear_0 = paddle.nn.Linear(input_size, size)
         self.linear_1 = paddle.nn.Linear(input_size, size)
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py 
b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py index f864e2829046b7..2003e685327b8f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_load_static_param.py @@ -21,7 +21,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework -from paddle.fluid.dygraph.nn import BatchNorm, Embedding +from paddle.fluid.dygraph.nn import BatchNorm from paddle.nn import Linear @@ -206,8 +206,8 @@ def __init__(self): self.batch_norm_1 = BatchNorm(10) self.batch_norm_2 = BatchNorm(10) - self.emb1 = Embedding([1000, 100]) - self.emb2 = Embedding([2000, 200]) + self.emb1 = paddle.nn.Embedding(1000, 100) + self.emb2 = paddle.nn.Embedding(2000, 200) self.layer_norm_1 = paddle.nn.LayerNorm([10]) self.layer_norm_2 = paddle.nn.LayerNorm(10) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index fe706a78f8fe69..0027cbfa2a9bff 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -22,7 +22,6 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer @@ -42,11 +41,12 @@ def __init__( self.vocab_size = vocab_size self.init_scale = init_scale self.num_steps = num_steps - self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + paddle.set_default_dtype(dtype) + self.embedding = paddle.nn.Embedding( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py index faaa02ea46a5d0..0984104269c420 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_named_members.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_named_members.py @@ -101,7 +101,7 @@ def __init__(self): self.linear1 = paddle.nn.Linear(10, 10) self.linear2 = paddle.nn.Linear(5, 5) self.conv2d = paddle.nn.Conv2D(3, 2, 3) - self.embedding = fluid.dygraph.Embedding(size=[128, 16]) + self.embedding = paddle.nn.Embedding(128, 16) self.h_0 = fluid.dygraph.to_variable( np.zeros([10, 10]).astype('float32') ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 1df0a4148c9c63..12118beaffe3b9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -21,7 +21,7 @@ import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import BatchNorm, Embedding +from paddle.fluid.dygraph.nn import BatchNorm from paddle.fluid.framework import _test_eager_guard from paddle.nn import Linear @@ -371,8 +371,8 @@ def __init__(self): Config.decoder_size, bias_attr=False, ) - self.embedding = Embedding( - 
[Config.num_classes + 2, Config.word_vector_dim], dtype='float32' + self.embedding = paddle.nn.Embedding( + Config.num_classes + 2, Config.word_vector_dim ) self.gru_decoder_with_attention = GRUDecoderWithAttention( Config.decoder_size, Config.num_classes diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 2a59dd396f000b..6bbf0a70c2e347 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -23,9 +23,9 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer +from paddle.nn import Embedding class SimpleLSTMRNN(fluid.Layer): @@ -172,10 +172,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 2e30ea41a18cd2..4e30f591686dc1 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -22,9 +22,9 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import Adam +from paddle.nn import Embedding class SimpleLSTMRNN(fluid.Layer): @@ -167,10 +167,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale @@ -991,7 +991,7 @@ def func_testSetNumpyBeforeTrain(self): def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) @@ -1011,7 +1011,7 @@ def func_testOnlyLoadParams(self): def func_test_load_compatible_with_keep_name_table(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() fluid.save_dygraph(state_dict, os.path.join('saved_dy', 'emb_dy')) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 4a3c6c64a6f6e5..a567a443e44859 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -23,8 +23,8 @@ import paddle.fluid.core as core from paddle.fluid.dygraph.base import to_variable from paddle.fluid.dygraph.learning_rate_scheduler import LearningRateDecay -from 
paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard +from paddle.nn import Embedding from paddle.optimizer import Adam @@ -168,10 +168,10 @@ def __init__( dropout=dropout, ) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=False, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale @@ -1015,7 +1015,7 @@ def func_testSetNumpyBeforeTrain(self): def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, @@ -1028,7 +1028,7 @@ def func_testOnlyLoadParams(self): def func_test_no_state_in_input_dict(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, @@ -1044,7 +1044,7 @@ def func_test_no_state_in_input_dict(self): def func_test_state_shape_mismatch(self): with fluid.dygraph.guard(): - emb = fluid.dygraph.Embedding([10, 10]) + emb = paddle.nn.Embedding(10, 10) state_dict = emb.state_dict() paddle.save( state_dict, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py index 72c77e753f54b6..498317b2a33f9f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows.py @@ -27,11 +27,11 @@ class SimpleNet(paddle.nn.Layer): def __init__(self, vocab_size, hidden_size, dtype): super().__init__() - self.emb = fluid.dygraph.Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - param_attr='emb.w', - is_sparse=True, + self.emb = paddle.nn.Embedding( + vocab_size, + hidden_size, + weight_attr='emb.w', + sparse=True, ) def forward(self, input): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index dd490e8d5553bb..220bde8e5b235f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -22,9 +22,9 @@ import paddle.fluid.core as core import paddle.fluid.framework as framework from paddle.fluid.dygraph.base import to_variable -from paddle.fluid.dygraph.nn import Embedding from paddle.fluid.framework import _test_eager_guard from paddle.fluid.optimizer import SGDOptimizer +from paddle.nn import Embedding class SimpleNet(fluid.Layer): @@ -42,11 +42,12 @@ def __init__( self.vocab_size = vocab_size self.init_scale = init_scale self.num_steps = num_steps + paddle.set_default_dtype(dtype) self.embedding = Embedding( - size=[vocab_size, hidden_size], - dtype=dtype, - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + vocab_size, + hidden_size, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name='embedding_para', initializer=fluid.initializer.UniformInitializer( low=-init_scale, high=init_scale diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 3cc07ee6a3378b..f73e94363844cd 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -20,7 +20,7 @@ import paddle import paddle.fluid as fluid import paddle.nn.functional as F -from paddle.fluid import Embedding, Layer, core +from paddle.fluid import Layer, core from paddle.fluid.dygraph import guard, to_variable from paddle.fluid.framework import _in_legacy_dygraph, _test_eager_guard from paddle.nn import Linear @@ -664,11 +664,11 @@ def __init__( self._src_emb_dim = src_emb_dim self._src_vocab_size = src_vocab_size self._dropout_rate = dropout_rate - self._input_emb = Embedding( - size=[src_vocab_size, src_emb_dim], - is_sparse=is_sparse, - padding_idx=0, - param_attr=fluid.ParamAttr( + self._input_emb = paddle.nn.Embedding( + src_vocab_size, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=word_emb_param_name, initializer=fluid.initializer.Normal(0.0, src_emb_dim**-0.5), ), @@ -678,10 +678,11 @@ def __init__( pos_inp = pos_inp1 else: pos_inp = pos_inp2 - self._pos_emb = Embedding( - size=[self._src_max_len, src_emb_dim], - is_sparse=is_sparse, - param_attr=fluid.ParamAttr( + self._pos_emb = paddle.nn.Embedding( + self._src_max_len, + src_emb_dim, + sparse=is_sparse, + weight_attr=fluid.ParamAttr( name=pos_enc_param_name, initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 01a9c098b0e913..83cec6d60443fa 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -26,7 +26,7 @@ import paddle.fluid.nets as nets import paddle.nn.functional as F from paddle.fluid import core -from paddle.fluid.dygraph import base, nn, to_variable +from paddle.fluid.dygraph import base, to_variable from paddle.fluid.framework import ( Program, _test_eager_guard, @@ -732,8 +732,8 @@ def test_embeding(self): )[0] with self.static_graph(): data_t = layers.data(name='word', shape=[1], dtype='int64') - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr='emb.w', sparse=False ) emb_rlt = emb2(data_t) static_rlt2 = self.get_static_graph_result( @@ -741,16 +741,17 @@ def test_embeding(self): )[0] with self.dynamic_graph(): with _test_eager_guard(): - emb2 = nn.Embedding( - size=[dict_size, 32], - param_attr='eager_emb.w', - is_sparse=False, + emb2 = paddle.nn.Embedding( + dict_size, + 32, + weight_attr='eager_emb.w', + sparse=False, ) dy_eager_rlt = emb2(base.to_variable(inp_word)) dy_eager_rlt_value = dy_eager_rlt.numpy() - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr='emb.w', is_sparse=False + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr='emb.w', sparse=False ) dy_rlt = emb2(base.to_variable(inp_word)) dy_rlt_value = dy_rlt.numpy() @@ -767,11 +768,12 @@ def test_embeding(self): custom_weight ) ) - emb1 = nn.Embedding(size=[dict_size, 32], is_sparse=False) - emb2 = nn.Embedding( - size=[dict_size, 32], - param_attr=weight_attr, - is_sparse=False, + emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) + emb2 = paddle.nn.Embedding( + dict_size, + 32, + weight_attr=weight_attr, + sparse=False, ) rep1 = emb1(base.to_variable(inp_word)) rep2 = emb2(base.to_variable(inp_word)) @@ -797,9 +799,9 @@ def test_embeding(self): custom_weight ) ) - emb1 = nn.Embedding(size=[dict_size, 32], 
is_sparse=False) - emb2 = nn.Embedding( - size=[dict_size, 32], param_attr=weight_attr, is_sparse=False + emb1 = paddle.nn.Embedding(dict_size, 32, sparse=False) + emb2 = paddle.nn.Embedding( + dict_size, 32, weight_attr=weight_attr, sparse=False ) rep1 = emb1(base.to_variable(inp_word)) rep2 = emb2(base.to_variable(inp_word)) diff --git a/python/paddle/fluid/tests/unittests/test_nan_inf.py b/python/paddle/fluid/tests/unittests/test_nan_inf.py index 289b5ec40f9d5f..4f0e02fdf613c0 100644 --- a/python/paddle/fluid/tests/unittests/test_nan_inf.py +++ b/python/paddle/fluid/tests/unittests/test_nan_inf.py @@ -17,9 +17,9 @@ import sys import unittest -import paddle +import numpy as np -paddle.enable_static() +import paddle class TestNanInf(unittest.TestCase): @@ -47,12 +47,7 @@ def check_nan_inf(self): print(err) # in python3, type(out+err) is 'bytes', need use encode - if paddle.fluid.core.is_compiled_with_cuda(): - assert (out + err).find('find_nan=1, find_inf=1'.encode()) != -1 - else: - assert (out + err).find( - 'There are `nan` or `inf` in tensor'.encode() - ) != -1 + assert (out + err).find('There are NAN or INF'.encode()) != -1 def test_nan_inf_in_static_mode(self): self._python_interp += " check_nan_inf_base.py" @@ -75,5 +70,97 @@ def setUp(self): ) +class TestNanInfCheckResult(unittest.TestCase): + def generate_inputs(self, shape, dtype="float32"): + data = np.random.random(size=shape).astype(dtype) + # [-10, 10) + x = (data * 20 - 10) * np.random.randint( + low=0, high=2, size=shape + ).astype(dtype) + y = np.random.randint(low=0, high=2, size=shape).astype(dtype) + return x, y + + def get_reference_num_nan_inf(self, x): + out = np.log(x) + num_nan = np.sum(np.isnan(out)) + num_inf = np.sum(np.isinf(out)) + print("[reference] num_nan={}, num_inf={}".format(num_nan, num_inf)) + return num_nan, num_inf + + def get_num_nan_inf(self, x_np, use_cuda=True, add_assert=False): + num_nan = 0 + num_inf = 0 + try: + if use_cuda: + paddle.device.set_device("gpu:0") + else: + paddle.device.set_device("cpu") + x = paddle.to_tensor(x_np) + out = paddle.log(x) + sys.stdout.flush() + if add_assert: + assert False + except Exception as e: + # Cannot catch the log in CUDA kernel. 
+ err_str_list = ( + str(e) + .replace("(", " ") + .replace(")", " ") + .replace(",", " ") + .split(" ") + ) + for err_str in err_str_list: + if "num_nan" in err_str: + num_nan = int(err_str.split("=")[1]) + elif "num_inf" in err_str: + num_inf = int(err_str.split("=")[1]) + print("[paddle] num_nan={}, num_inf={}".format(num_nan, num_inf)) + return num_nan, num_inf + + def test_num_nan_inf(self): + def _check_num_nan_inf(use_cuda): + shape = [32, 32] + x_np, _ = self.generate_inputs(shape) + num_nan_np, num_inf_np = self.get_reference_num_nan_inf(x_np) + add_assert = (num_nan_np + num_inf_np) > 0 + num_nan, num_inf = self.get_num_nan_inf(x_np, use_cuda, add_assert) + if not use_cuda: + assert num_nan == num_nan_np and num_inf == num_inf_np + + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 0} + ) + _check_num_nan_inf(use_cuda=False) + if paddle.fluid.core.is_compiled_with_cuda(): + _check_num_nan_inf(use_cuda=True) + + def check_nan_inf_level(self, use_cuda, dtype): + shape = [8, 8] + x_np, y_np = self.generate_inputs(shape, dtype) + + if use_cuda: + paddle.device.set_device("gpu:0") + else: + paddle.device.set_device("cpu") + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + out = paddle.log(x * 1e6) / y + + def test_check_nan_inf_level_float32(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 2} + ) + self.check_nan_inf_level(use_cuda=False, dtype="float32") + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float32") + + def test_check_nan_inf_level_float16(self): + paddle.set_flags( + {"FLAGS_check_nan_inf": 1, "FLAGS_check_nan_inf_level": 3} + ) + if paddle.fluid.core.is_compiled_with_cuda(): + self.check_nan_inf_level(use_cuda=True, dtype="float16") + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index bfa6966e543b09..f990c2171b92e3 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -689,9 +689,7 @@ def model_init( beam_size=4, max_step_num=20, ): - embedder = paddle.fluid.dygraph.Embedding( - size=[vocab_size, embed_dim], dtype="float64" - ) + embedder = paddle.nn.Embedding(vocab_size, embed_dim) output_layer = nn.Linear(hidden_size, vocab_size) cell = nn.LSTMCell(embed_dim, hidden_size) self.max_step_num = max_step_num diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 5413b5983cd19b..bdb88e7290b570 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -75,8 +75,8 @@ from ..fluid.layers import exponential_decay # noqa: F401 from .nn.common import batch_norm # noqa: F401 -from paddle.static.nn.metric import auc # noqa: F401 -from paddle.static.nn.metric import accuracy # noqa: F401 +from .nn.metric import auc # noqa: F401 +from .nn.metric import accuracy # noqa: F401 __all__ = [ # noqa 'append_backward', diff --git a/python/paddle/static/nn/metric.py b/python/paddle/static/nn/metric.py index 3ed54ddd2cceee..9b386c38c8e68a 100644 --- a/python/paddle/static/nn/metric.py +++ b/python/paddle/static/nn/metric.py @@ -22,7 +22,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers import tensor -__all__ = ['accuracy', 'auc'] +__all__ = [] def accuracy(input, label, k=1, correct=None, total=None):
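
Note on the Embedding updates above: every test in this patch migrates from the legacy paddle.fluid.dygraph.nn.Embedding to paddle.nn.Embedding, and the rewrite is mechanical: the size=[vocab, dim] list becomes two positional arguments, param_attr becomes weight_attr, is_sparse becomes sparse, and the explicit dtype argument is dropped (the layer follows the default dtype, which is why some tests call paddle.set_default_dtype). The sketch below summarizes the mapping; the sizes, parameter name, and input tensor are illustrative only and are not taken from any particular test.

# Minimal sketch of the old vs. new Embedding construction (illustrative values).
import paddle
import paddle.fluid as fluid

vocab_size, hidden_size = 1000, 64

# Legacy dygraph API removed throughout this patch:
#     emb = fluid.dygraph.nn.Embedding(
#         size=[vocab_size, hidden_size],
#         dtype='float32',
#         is_sparse=True,
#         param_attr=fluid.ParamAttr(name='embedding_para'),
#     )

# Replacement used in the updated tests:
emb = paddle.nn.Embedding(
    vocab_size,
    hidden_size,
    sparse=True,
    weight_attr=fluid.ParamAttr(name='embedding_para'),
)

ids = paddle.to_tensor([[1, 2, 3]], dtype='int64')
out = emb(ids)  # shape [1, 3, hidden_size], default dtype (float32)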