diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b98a2288682660..08055cd9a5407b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -156,7 +156,7 @@ cc_test(test_seqpool_cvm_concat_fuse_pass SRCS seqpool_cvm_concat_fuse_pass_test cc_test(test_repeated_fc_relu_fuse_pass_cc SRCS repeated_fc_relu_fuse_pass_tester.cc DEPS repeated_fc_relu_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_simplify_with_basic_ops_pass SRCS simplify_with_basic_ops_pass_tester.cc DEPS simplify_with_basic_ops_pass) -cc_test(test_fc_elementwise_layernorm_fuse_pass SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) +cc_test(test_fc_elementwise_layernorm_fuse_pass_cc SRCS fc_elementwise_layernorm_fuse_pass_tester.cc DEPS fc_elementwise_layernorm_fuse_pass) cc_test(test_skip_layernorm_fuse_pass SRCS skip_layernorm_fuse_pass_tester.cc DEPS skip_layernorm_fuse_pass) cc_test(test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) cc_test(test_conv_bn_fuse_pass_cc SRCS conv_bn_fuse_pass_tester.cc DEPS conv_bn_fuse_pass) diff --git a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc index d3cf3319adfc5e..0bf30c29f32793 100644 --- a/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_elementwise_layernorm_fuse_pass.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace framework { @@ -338,3 +339,9 @@ void FCElementwiseLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { REGISTER_PASS(fc_elementwise_layernorm_fuse_pass, paddle::framework::ir::FCElementwiseLayerNormFusePass); +REGISTER_PASS_CAPABILITY(fc_elementwise_layernorm_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .LE("elementwise_add", 1) + .EQ("layer_norm", 0)); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc new file mode 100644 index 00000000000000..9dcbbb9c9856e5 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
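The capability registration added above gates the fuse pass on operator versions: the pass is declared compatible only when `fc` is at version 0, `elementwise_add` at version 1 or lower, and `layer_norm` at version 0. A toy sketch of the predicate that this EQ/LE combination encodes (a hypothetical stand-in, not the real op_version_registry API):

```cpp
#include <map>
#include <string>

// True when every op version in the model satisfies its comparator,
// mirroring the EQ/LE combination registered above.
bool PassCompatible(const std::map<std::string, int>& op_versions) {
  return op_versions.at("fc") == 0 &&               // .EQ("fc", 0)
         op_versions.at("elementwise_add") <= 1 &&  // .LE("elementwise_add", 1)
         op_versions.at("layer_norm") == 0;         // .EQ("layer_norm", 0)
}
```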
+ +#include "paddle/fluid/framework/ir/ipu/avg_shard_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AvgShardPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter AvgShardPass::ApplyImpl"; + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + if (ipu_backend->GetIpuStrategy()->need_avg_shard) { + VLOG(10) << "start AvgShardPass"; + auto nodes = ir::TopologySortOperations(*graph); + auto num_ipus = ipu_backend->GetIpuStrategy()->num_ipus; + + int shard_position = nodes.size() / num_ipus; + int index_and_stage = -1; + for (int i = 0; i < nodes.size(); i++) { + if ((i % shard_position) == 0 && index_and_stage < num_ipus - 1) { + index_and_stage++; + } + nodes[i]->Op()->SetAttr("ipu_index", index_and_stage); + nodes[i]->Op()->SetAttr("ipu_stage", index_and_stage); + } + VLOG(10) << "end AvgShardPass"; + } + + VLOG(10) << "leave AvgShardPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(avg_shard_pass, paddle::framework::ir::AvgShardPass); diff --git a/paddle/fluid/framework/ir/ipu/avg_shard_pass.h b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h new file mode 100644 index 00000000000000..b13acbd198dd52 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/avg_shard_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AvgShardPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc new file mode 100644 index 00000000000000..5dcfddf6187f2b --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void ForwardGraphExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter ForwardGraphExtractPass::ApplyImpl"; + + std::unordered_map> all_ops{ + {OpRole::kForward, {}}, {OpRole::kBackward, {}}, + {OpRole::kOptimize, {}}, {OpRole::kRPC, {}}, + {OpRole::kDist, {}}, {OpRole::kLRSched, {}}, + {OpRole::kLoss, {}}, {OpRole::kNotSpecified, {}}}; + for (auto* node : graph->Nodes()) { + if (!node->IsOp()) { + continue; + } + auto op_role = BOOST_GET_MUTABLE(int, node->Op()->GetAttr("op_role")); + if (op_role == static_cast(OpRole::kForward)) { + all_ops[OpRole::kForward].insert(node); + } else if (op_role == static_cast(OpRole::kBackward)) { + all_ops[OpRole::kBackward].insert(node); + } else if (op_role == static_cast(OpRole::kOptimize)) { + all_ops[OpRole::kOptimize].insert(node); + } else if (op_role == static_cast(OpRole::kRPC)) { + } else if (op_role == static_cast(OpRole::kDist)) { + } else if (op_role == static_cast(OpRole::kLRSched)) { + } else if (op_role == static_cast(OpRole::kLoss)) { + all_ops[OpRole::kLoss].insert(node); + } else if (op_role == static_cast(OpRole::kNotSpecified)) { + LOG(WARNING) << "Op: " << node->Name() << " OpRole is NotSpecified "; + } + } + + std::unordered_set forward_vars; + std::unordered_set backward_vars; + std::unordered_set control_vars; + // forward_vars + for (auto& nodes : std::array, 2>{ + all_ops[OpRole::kForward], all_ops[OpRole::kLoss]}) { + for (auto* node : nodes) { + for (auto* in_node : node->inputs) { + forward_vars.insert(in_node); + } + for (auto* out_node : node->outputs) { + forward_vars.insert(out_node); + } + } + } + // control_vars & backward_vars + for (auto* node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + if (node->IsCtrlVar()) { + control_vars.insert(node); + } + for (auto* in_node : node->inputs) { + if (all_ops[OpRole::kOptimize].count(in_node)) { + backward_vars.insert(node); + } + } + } + // all removed node + std::unordered_set rm_nodes; + for (auto* node : graph->Nodes()) { + if (backward_vars.count(node)) { + rm_nodes.insert(node); + } else if (control_vars.count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kBackward].count(node)) { + rm_nodes.insert(node); + } else if (all_ops[OpRole::kForward].count(node) == 0 && + all_ops[OpRole::kLoss].count(node) == 0 && + forward_vars.count(node) == 0) { + rm_nodes.insert(node); + } else if (node->Name() == "feed" || node->Name() == "fetch") { + rm_nodes.insert(node); + } + } + + VLOG(10) << "Remove Node: "; + for (auto* node : rm_nodes) { + // rm node releations + for (auto* node_in : node->inputs) { + for (size_t i = 0; i < node_in->outputs.size(); ++i) { + if (node_in->outputs[i] == node) { + node_in->outputs.erase(node_in->outputs.begin() + i); + break; + } + } + } + for (auto* node_out : node->outputs) { + for (size_t i = 0; i < node_out->inputs.size(); ++i) { + if (node_out->inputs[i] == node) { + node_out->inputs.erase(node_out->inputs.begin() + i); + break; + } + } + } + VLOG(10) << "\t" << node->Name(); + graph->RemoveNode(node); + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + + VLOG(10) << "leave ForwardGraphExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(forward_graph_extract_pass, + paddle::framework::ir::ForwardGraphExtractPass); diff --git 
a/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h new file mode 100644 index 00000000000000..afa9f1c15f2ab8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/forward_graph_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ForwardGraphExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc new file mode 100644 index 00000000000000..ceef27ac1ce3c0 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
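The relation-removal loop in `forward_graph_extract_pass` above unlinks each doomed node from its neighbors' adjacency vectors by index. A minimal sketch of equivalent unlinking with the erase-remove idiom (toy `Node` type, not the Paddle class):

```cpp
#include <algorithm>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
};

// Detach `node` from every neighbor's adjacency list, as the pass does
// before calling graph->RemoveNode(node).
void Unlink(Node* node) {
  for (Node* in : node->inputs) {
    auto& outs = in->outputs;
    outs.erase(std::remove(outs.begin(), outs.end(), node), outs.end());
  }
  for (Node* out : node->outputs) {
    auto& ins = out->inputs;
    ins.erase(std::remove(ins.begin(), ins.end(), node), ins.end());
  }
}

int main() {
  Node a{"a"}, b{"b"};
  a.outputs.push_back(&b);
  b.inputs.push_back(&a);
  Unlink(&b);
  return a.outputs.empty() ? 0 : 1;  // 0: unlink succeeded
}
```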
+ +#include "paddle/fluid/framework/ir/ipu/infer_shape_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/variable_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferShapePass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferShapePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + auto batch_size = ipu_backend->GetIpuStrategy()->batch_size; + + auto feed_list = Get>("feed_list"); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + bool is_feed = std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end(); + if (is_feed) { + auto input_shape = node->Var()->GetShape(); + if (input_shape[0] <= -1) { + input_shape[0] = batch_size; + node->Var()->SetShape(input_shape); + } + // int64->int32 + if (node->Var()->GetDataType() == proto::VarType::INT64) { + node->Var()->SetDataType(proto::VarType::INT32); + } + } + } + + // temp scope for shape inference + std::shared_ptr scope( + new paddle::framework::Scope()); + for (auto node : graph->Nodes()) { + if (!node->IsVar()) { + continue; + } + auto var_desc = node->Var(); + auto* ptr = scope->Var(var_desc->Name()); + paddle::framework::InitializeVariable(ptr, var_desc->GetType()); + + auto tensor = ptr->GetMutable(); + tensor->Resize(paddle::framework::make_ddim(var_desc->GetShape())); + } + + // infer shape + auto nodes = ir::TopologySortOperations(*graph); + for (auto node : nodes) { + auto op_desc = node->Op(); + auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); + paddle::framework::RuntimeContext ctx(op->Inputs(), op->Outputs(), *scope); + op->RuntimeInferShape(*scope, paddle::platform::CPUPlace(), ctx); + + for (auto it = ctx.outputs.begin(); it != ctx.outputs.end(); it++) { + for (int i = 0; i < it->second.size(); i++) { + auto output_name = op_desc->Output(it->first)[i]; + auto dim = + it->second[i]->GetMutable()->dims(); + auto new_shape = paddle::framework::vectorize(dim); + for (auto output_node : node->outputs) { + if (output_node->Name() == output_name) { + output_node->Var()->SetShape(new_shape); + } + } + } + } + } + // release the temp scope + scope.reset(); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave InferShapePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_shape_pass, paddle::framework::ir::InferShapePass) + .RequirePassAttr("feed_list"); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.h b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h new file mode 100644 index 00000000000000..3e8148b7f066d9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferShapePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc new file mode 100644 index 00000000000000..616139a52ac06c --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferencePostprocessPass::ApplyImpl(ir::Graph *graph) const { + VLOG(10) << "enter InferencePostprocessPass::ApplyImpl"; + + std::vector feed_list; + feed_list = Get>("feed_list"); + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + auto *feed_var = new paddle::framework::VarDesc("feed"); + feed_var->SetType(proto::VarType::FEED_MINIBATCH); + auto *feed_var_node = graph->CreateVarNode(feed_var); + + auto *fetch_var = new paddle::framework::VarDesc("fetch"); + fetch_var->SetType(proto::VarType::FETCH_LIST); + auto *fetch_var_node = graph->CreateVarNode(fetch_var); + + for (int i = 0; i < feed_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == feed_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("feed"); + op->SetInput("X", {"feed"}); + op->SetOutput("Out", {node->Name()}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + node->inputs.push_back(op_node); + op_node->outputs.push_back(node); + feed_var_node->outputs.push_back(op_node); + op_node->inputs.push_back(feed_var_node); + break; + } + } + } + + for (int i = 0; i < fetch_list.size(); i++) { + for (auto node : graph->Nodes()) { + if (node->Name() == fetch_list[i]) { + auto *op = new paddle::framework::OpDesc(); + op->SetType("fetch"); + op->SetInput("X", {node->Name()}); + op->SetOutput("Out", {"fetch"}); + op->SetAttr("col", i); + auto *op_node = graph->CreateOpNode(op); + 
node->outputs.push_back(op_node); + op_node->inputs.push_back(node); + fetch_var_node->inputs.push_back(op_node); + op_node->outputs.push_back(fetch_var_node); + break; + } + } + } + + VLOG(10) << "leave InferencePostprocessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(inference_postprocess_pass, + paddle::framework::ir::InferencePostprocessPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h new file mode 100644 index 00000000000000..e80e1905d4ad79 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_postprocess_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferencePostprocessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc new file mode 100644 index 00000000000000..d02dcce0cc62c8 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
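A small sketch of the feed-shape normalization step in `infer_shape_pass` above: a dynamic batch dimension (-1) is pinned to the strategy's batch size before shapes are propagated (hypothetical values):

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Hypothetical feed var with a dynamic batch dimension.
  std::vector<long> input_shape = {-1, 3, 224, 224};
  const long batch_size = 8;  // from IpuStrategy in the real pass

  // Same rule as InferShapePass: only the leading (batch) dim is fixed.
  if (input_shape[0] <= -1) {
    input_shape[0] = batch_size;
  }
  for (long d : input_shape) std::printf("%ld ", d);  // prints: 8 3 224 224
  return 0;
}
```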
+ +#include "paddle/fluid/framework/ir/ipu/inference_process_pass.h" + +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" +#include "paddle/fluid/platform/device/ipu/ipu_strategy.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter InferenceProcessPass::ApplyImpl"; + + // Get a new instance of ipu_backend + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetNewInstance(); + + // Set scope + auto& scope = graph->Get(kParamScopeAttr); + ipu_backend->SetScope(scope); + + // Set ipu_strategy + static std::shared_ptr ipu_strategy_instance_( + new platform::ipu::IpuStrategy()); + ipu_strategy_instance_->is_training = false; + auto num_ipus = graph->Get("num_ipus"); + ipu_strategy_instance_->num_ipus = num_ipus; + if (num_ipus > 1) { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Manual; + } else { + ipu_strategy_instance_->popart_options_.virtualGraphMode = + platform::ipu::VirtualGraphMode::Off; + } + + auto enable_pipelining = graph->Get("enable_pipelining"); + ipu_strategy_instance_->popart_options_.enablePipelining = enable_pipelining; + if (enable_pipelining) { + auto batches_per_step = graph->Get("batches_per_step"); + PADDLE_ENFORCE_GE( + batches_per_step, num_ipus, + platform::errors::InvalidArgument("Batched per step should be equal or " + "greater than the number of IPUs")); + ipu_strategy_instance_->batches_per_step = batches_per_step; + } + ipu_strategy_instance_->batch_size = graph->Get("batch_size"); + ipu_strategy_instance_->need_avg_shard = graph->Get("need_avg_shard"); + + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); + + // Get feed_list and fetch list + std::vector feed_list = {}; + std::vector fetch_list = {}; + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list.push_back(""); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list.push_back(""); + } + } + } + for (auto node : graph->Nodes()) { + if (node->Name() == "feed") { + if (node->IsOp()) { + feed_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->outputs[0]->Name(); + } + } else if (node->Name() == "fetch") { + if (node->IsOp()) { + fetch_list[BOOST_GET_CONST(int, node->Op()->GetAttr("col"))] = + node->inputs[0]->Name(); + } + } + } + + // Run passes + std::vector graph_pass = {"forward_graph_extract_pass", + "infer_shape_pass", "avg_shard_pass", + "popart_canonicalization_pass"}; + std::vector compile_pass = { + "ipu_inplace_pass", "ipu_graph_builder_pass", "ipu_runtime_replacer_pass", + "inference_postprocess_pass"}; + for (auto pass_name : graph_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + if (pass_name == "infer_shape_pass") { + pass->Set("feed_list", new std::vector(feed_list.begin(), + feed_list.end())); + } + pass->Apply(graph); + } + + for (auto pass_name : compile_pass) { + auto pass = PassRegistry::Instance().Get(pass_name); + pass->Set("feed_list", + new std::vector(feed_list.begin(), feed_list.end())); + pass->Set("fetch_list", new std::vector(fetch_list.begin(), + fetch_list.end())); + pass->Apply(graph); + } + + VLOG(10) << "leave InferenceProcessPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + 
+REGISTER_PASS(inference_process_pass, + paddle::framework::ir::InferenceProcessPass); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.h b/paddle/fluid/framework/ir/ipu/inference_process_pass.h new file mode 100644 index 00000000000000..bac0e88377f7c6 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferenceProcessPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc new file mode 100644 index 00000000000000..5a53466089bc88 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.cc @@ -0,0 +1,52 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
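The feed/fetch collection in `inference_process_pass` above is two-phase: one scan over the graph sizes the lists with placeholders, and a second scan writes each variable name at its `col` index, so list order follows the feed/fetch columns rather than the unordered graph traversal. A minimal sketch of that placement (hypothetical data):

```cpp
#include <cstdio>
#include <string>
#include <vector>

struct FeedOp { int col; std::string var; };  // toy stand-in for a feed op

int main() {
  // Feed ops as they might be discovered in unordered traversal.
  std::vector<FeedOp> found = {{2, "z"}, {0, "x"}, {1, "y"}};

  // Phase 1: size the list; phase 2: place each name by its col attr.
  std::vector<std::string> feed_list(found.size(), "");
  for (const auto& op : found) feed_list[op.col] = op.var;

  for (const auto& n : feed_list) std::printf("%s ", n.c_str());  // x y z
  return 0;
}
```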
+ +#include "paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuGraphBuilderPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuGraphBuilderPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + + ipu_backend->Compile(graph, feed_list, fetch_list); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuGraphBuilderPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_graph_builder_pass, + paddle::framework::ir::IpuGraphBuilderPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h new file mode 100644 index 00000000000000..6237df36480335 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_graph_builder_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuGraphBuilderPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc new file mode 100644 index 00000000000000..d3f1f1633ffc94 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::string GenerateVarName(Node *node) { + return node->Name() + "_" + std::to_string(node->id()); +} + +void IpuInplacePass::ApplyImpl(ir::Graph *graph) const { + // use this pass after forward_graph_extract_pass + // raise error if the inplaced var both in feed_list & fetch_list + VLOG(10) << "enter IpuInplacePass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + std::map var_name; + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name.find(node->Name()) == var_name.end()) { + var_name.emplace(node->Name(), 1); + } else { + var_name[node->Name()]++; + } + } + } + + for (auto *node : graph->Nodes()) { + if (node->IsVar()) { + if (var_name[node->Name()] > 1) { + auto is_feed = (std::find(feed_list.begin(), feed_list.end(), + node->Name()) != feed_list.end()) && + (node->inputs.size() == 0); + auto is_fetch = (std::find(fetch_list.begin(), fetch_list.end(), + node->Name()) != fetch_list.end()) && + (node->outputs.size() == 0); + if (!is_feed && !is_fetch && !node->Var()->Persistable()) { + auto old_name = node->Name(); + auto new_name = GenerateVarName(node); + node->RenameVar(new_name); + for (auto *op_in : node->inputs) { + op_in->Op()->RenameOutput(old_name, new_name); + } + for (auto *op_out : node->outputs) { + op_out->Op()->RenameInput(old_name, new_name); + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuInplacePass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_inplace_pass, paddle::framework::ir::IpuInplacePass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h new file mode 100644 index 00000000000000..86756276c8c3dc --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_inplace_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuInplacePass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc new file mode 100644 index 00000000000000..ba9233eeb8cb95 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IPUPassBase::Init(const std::string& repr, Graph* graph) const { + repr_ = repr; + graph_ = graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_pass_base.h b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h new file mode 100644 index 00000000000000..b56d3e4c65b1c0 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_pass_base.h @@ -0,0 +1,37 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IPUPassBase : public Pass { + public: + void Init(const std::string& repr, Graph* graph) const; + virtual ~IPUPassBase() {} + + protected: + mutable Graph* graph_; + mutable std::string repr_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc new file mode 100644 index 00000000000000..a3e020714e1db9 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.cc @@ -0,0 +1,97 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
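A compact sketch of the de-duplication rule in `ipu_inplace_pass` above: when several var nodes share a name, each non-feed, non-fetch, non-persistable node is renamed to `name_id` (the `GenerateVarName` rule), and its producer/consumer ops are updated to match. Toy types and hypothetical names:

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Hypothetical var nodes as (name, node id): "tmp" is inplaced twice.
  std::vector<std::pair<std::string, int>> var_nodes = {
      {"tmp", 3}, {"w", 4}, {"tmp", 7}};

  // First loop of the pass: count occurrences per name.
  std::map<std::string, int> var_name;
  for (const auto& n : var_nodes) ++var_name[n.first];

  // Second loop: rename duplicates to name_id, as GenerateVarName does.
  for (auto& n : var_nodes) {
    if (var_name[n.first] > 1) {
      n.first += "_" + std::to_string(n.second);
    }
    std::printf("%s\n", n.first.c_str());  // tmp_3, w, tmp_7
  }
  return 0;
}
```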
+ +#include "paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h" + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuRuntimeReplacerPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuRuntimeReplacerPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + std::vector feed_list; + feed_list = Get>("feed_list"); + + std::vector fetch_list; + fetch_list = Get>("fetch_list"); + + framework::OpDesc ipu_rt_op_desc; + ipu_rt_op_desc.SetType("ipu_runtime"); + ipu_rt_op_desc.SetInput("FeedList", feed_list); + ipu_rt_op_desc.SetOutput("FetchList", fetch_list); + ipu_rt_op_desc.Flush(); + + // Create a new node for the ipu_runtime_op. + auto* ipu_rt_node = graph->CreateOpNode(&ipu_rt_op_desc); + + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto feed : feed_list) { + if (node->Name() == feed) { + IR_NODE_LINK_TO(node, ipu_rt_node); + } + } + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + IR_NODE_LINK_TO(ipu_rt_node, node); + } + } + } + } + + // set ipu_runtime_op dtype attr + if (fetch_list.size() == 1) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + for (auto fetch : fetch_list) { + if (node->Name() == fetch) { + ipu_rt_node->Op()->SetAttr("dtype", node->Var()->GetDataType()); + } + } + } + } + } + + // Remove unneeded nodes. + std::unordered_set marked_nodes; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op_desc = node->Op(); + if (op_desc->Type() != "ipu_runtime") { + marked_nodes.insert(node); + } + } + } + + GraphSafeRemoveNodes(graph, marked_nodes); + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuRuntimeReplacerPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(ipu_runtime_replacer_pass, + paddle::framework::ir::IpuRuntimeReplacerPass) + .RequirePassAttr("feed_list") + .RequirePassAttr("fetch_list"); diff --git a/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h new file mode 100644 index 00000000000000..ba2cc8702fa473 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/ipu_runtime_replacer_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuRuntimeReplacerPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc new file mode 100644 index 00000000000000..c6be2c775bd211 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +void IpuOptimizerExtractPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerExtractPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = paddle::platform::ipu::IpuBackend::GetInstance(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + // a graph usually has multiple optimizer nodes for different parameters, + // and these nodes usually share the same type and attr values + if ((op_role == static_cast<int>(framework::OpRole::kOptimize))) { + ipu_backend->GetExecutor().SetOptimizerType(node->Op()->Type()); + VLOG(10) << "found optimizer type: " << node->Op()->Type(); + + for (const std::string& attr_name : node->Op()->AttrNames()) { + auto attr_type = node->Op()->GetAttrType(attr_name); + // with adam, attrs are floats + if (attr_type == proto::AttrType::FLOAT) { + auto attr_value = + BOOST_GET_CONST(float, node->Op()->GetAttr(attr_name)); + ipu_backend->GetExecutor().SetOptimizerAttr(attr_name, attr_value); + } else { + VLOG(10) << "Skip " << attr_type; + } + } + + auto lr_var_name = node->Op()->Input("LearningRate"); + PADDLE_ENFORCE_EQ(lr_var_name.size(), 1u, + platform::errors::InvalidArgument( + "In op(%s), find input(LearningRate) failed.", + node->Op()->Type())); + + ipu_backend->GetExecutor().SetLRVarName(lr_var_name[0]); + } + + if ((op_role == static_cast<int>(framework::OpRole::kLoss))) { + VLOG(10) << "found loss op type: " << node->Op()->Type(); + auto outputs = node->Op()->Outputs(); + PADDLE_ENFORCE_EQ( + outputs.size(), 1, + platform::errors::InvalidArgument("Can only support one loss key")); + + auto losses_name = outputs.begin()->second; + PADDLE_ENFORCE_EQ(losses_name.size(), 1, + platform::errors::InvalidArgument( + "Can only support one loss name")); + + ipu_backend->GetExecutor().SetLoss(losses_name[0]); + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10)
<< DebugString(graph); + VLOG(10) << "leave IpuOptimizerExtractPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_extract_pass, + paddle::framework::ir::IpuOptimizerExtractPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h new file mode 100644 index 00000000000000..fd274ded8f5bd1 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_extract_pass.h @@ -0,0 +1,31 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class IpuOptimizerExtractPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc new file mode 100644 index 00000000000000..c23bfdcb154f16 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.cc @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
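A minimal sketch of the attribute filtering in `optimizer_extract_pass` above: only float-typed optimizer attributes are forwarded to the IPU executor, everything else is skipped (toy attribute store, hypothetical values):

```cpp
#include <cstdio>
#include <map>
#include <string>
#include <variant>

int main() {
  // Hypothetical adam attrs: floats are extracted, other types skipped.
  std::map<std::string, std::variant<float, int, bool>> attrs = {
      {"beta1", 0.9f},      {"beta2", 0.999f}, {"epsilon", 1e-8f},
      {"lazy_mode", false}, {"op_role", 2}};

  for (const auto& [name, value] : attrs) {
    if (std::holds_alternative<float>(value)) {
      // Mirrors SetOptimizerAttr(attr_name, attr_value) in the pass.
      std::printf("set %s = %g\n", name.c_str(), std::get<float>(value));
    }
  }
  return 0;
}
```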
+ +#include "paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/common.h" +#include "paddle/fluid/platform/device/ipu/ipu_backend.h" + +namespace paddle { +namespace framework { +namespace ir { + +using paddle::platform::ipu::IpuBackend; +using framework::ir::Graph; +using framework::ir::Node; + +void IpuOptimizerStateAlignPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter IpuOptimizerStateAlignPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto ipu_backend = IpuBackend::GetInstance(); + const auto* scope_ = ipu_backend->GetScope(); + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()) { + int op_role = BOOST_GET_CONST( + int, node->Op()->GetAttr( + framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + + if ((op_role == static_cast(framework::OpRole::kOptimize))) { + auto inputs = node->Op()->Inputs(); + if (inputs.count(platform::ipu::sBeta1Pow)) { + auto var = scope_->GetVar(inputs.at(platform::ipu::sBeta1Pow)[0]); + auto data = var->GetMutable()->data(); + auto beta = BOOST_GET_CONST( + float, node->Op()->GetAttr(platform::ipu::sBeta1)); + + // ensure current save with beta1pow, rather than step. + // beta1pow = beta1 ^ (step + 1). Just set beta1pow because popart + // support single Step__ + bool save_with_beta1pow = (data[0] < 1.0f) && (data[0] > 0.0f); + float step = 0; + float beta_acc = beta; + while (beta_acc > data[0] && save_with_beta1pow) { + beta_acc *= beta; + step += 1; + } + + if (save_with_beta1pow) { + data[0] = step; + } + } + } + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave IpuOptimizerStateAlignPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(optimizer_state_align_pass, + paddle::framework::ir::IpuOptimizerStateAlignPass); diff --git a/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h new file mode 100644 index 00000000000000..21a1017d88452a --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/optimizer_state_align_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * This pass should only affect optimizer that need bias correction, + * include Adam/Lamb. 
+ */ + +class IpuOptimizerStateAlignPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc new file mode 100644 index 00000000000000..c97b7fd5bcb0cb --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h" + +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/post_canonicalization.h" + +namespace paddle { +namespace framework { +namespace ir { + +using framework::ir::Graph; +using framework::ir::Node; +using platform::ipu::SymbolHandler; + +void PopartCanonicalizationPass::ApplyImpl(ir::Graph* graph) const { + VLOG(10) << "enter PopartCanonicalizationPass::ApplyImpl"; + VLOG(10) << "Raw Graph: "; + VLOG(10) << DebugString(graph); + + auto nodes = graph->Nodes(); + for (auto* node : nodes) { + if (!node->IsOp()) { + continue; + } + auto* op = node->Op(); + auto op_type = op->Type(); + + ir::Node* new_node = nullptr; + SymbolHandler handler = platform::ipu::GetHandler(op_type); + if (handler) { + VLOG(11) << "Raw Paddle Node:"; + VLOG(11) << node->Op()->Proto()->DebugString(); + new_node = handler(graph, node); + VLOG(11) << "Post Popart Node:"; + VLOG(11) << new_node->Op()->Proto()->DebugString(); + + platform::ipu::ClearNode(node); + graph->RemoveNode(node); + } else { + LOG(ERROR) << "Can not find OpHandler for op_type: " << op_type; + } + } + + VLOG(10) << "Post Graph: "; + VLOG(10) << DebugString(graph); + VLOG(10) << "leave PopartCanonicalizationPass::ApplyImpl"; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(popart_canonicalization_pass, + paddle::framework::ir::PopartCanonicalizationPass); diff --git a/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h new file mode 100644 index 00000000000000..6690873f2a9ac2 --- /dev/null +++ b/paddle/fluid/framework/ir/ipu/popart_canonicalization_pass.h @@ -0,0 +1,30 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/ipu/ipu_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PopartCanonicalizationPass : public IPUPassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index 523c2161326466..f3d96c38506564 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -84,13 +84,16 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( LOG(WARNING) << "Pass in op compat failed."; return; } + const int kNumFields = 5; const int kTransOffset = 1; const int kTransOutOffset = 2; const int kFlattenOffset = 3; const int kFlattenOutOffset = 4; - std::vector<Node*> nodes; + std::vector<Node*> nodes; + std::vector<int> trans_axis0; + int flatten_axis0; for (int i = 0; i < times; i++) { PADDLE_ENFORCE_NOT_NULL( subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))), @@ -112,6 +115,33 @@ void TransposeFlattenConcatFusePass::RunTransposeFlattenConcatFuse( platform::errors::NotFound("Can not find %s in subgraph.", input_nodes[i]->name())); + if (i == 0) { + trans_axis0 = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + flatten_axis0 = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(0))) + ->Op() + ->GetAttr("axis")); + } else { + std::vector<int> trans_axis = BOOST_GET_CONST( + std::vector<int>, + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All transpose axes should be the same + if (trans_axis0 != trans_axis) return; + + int flatten_axis = BOOST_GET_CONST( + int, subgraph.at(pattern.GetPDNode("flatten" + std::to_string(i))) + ->Op() + ->GetAttr("axis")); + // All flatten axes should be the same + if (flatten_axis0 != flatten_axis) return; + } + nodes.push_back(subgraph.at(input_nodes[i])); + nodes.push_back( + subgraph.at(pattern.GetPDNode("transpose" + std::to_string(i)))); diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt index b13166cff60aa3..8d2ee2f01008bf 100644 --- a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -2,7 +2,7 @@ cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper l cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector graph_pattern_detector cinn_compiler errors enforce) cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph transform_desc cinn) -cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) +cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS framework_proto graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn cinn_launch_context) if (WITH_TESTING) cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc index
368fb4a5fd8c96..0e157ae7d79f38 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -29,55 +29,32 @@ namespace paddle { namespace framework { namespace paddle2cinn { +using GraphHashStrategy = CinnCacheKey::GraphHashStrategy; + +CinnCacheKey::CinnCacheKey(GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) {} + CinnCacheKey::CinnCacheKey( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str) { + const std::string& arch_str, GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_tensors, arch_str); } CinnCacheKey::CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str) { + const std::string& arch_str, + GraphHashStrategy graph_hash) + : graph_hash_(graph_hash) { this->SetKey(graph, input_shapes, arch_str); } -size_t CinnCacheKey::HashGraph(const ir::Graph& graph) { - // using Dot to unqiue graph - inference::analysis::Dot dot; - std::unordered_map<const ir::Node*, std::string> node2dot; - int id = 0; - // Create nodes - // graph.Nodes() return unordered_set, the same graph may - // return different result? - for (const ir::Node* n : graph.Nodes()) { - std::string node_id = std::to_string(id++); - dot.AddNode(node_id, {}, n->Name(), true); - node2dot[n] = node_id; - } - - // Create edges - for (const ir::Node* n : graph.Nodes()) { - const auto& src_id = node2dot.at(n); - for (auto* out : n->outputs) { - const auto& dest_id = node2dot.at(out); - dot.AddEdge(src_id, dest_id, {}); - } - } - - const std::string& viz_graph = dot.Build(); - VLOG(1) << "The hash graph:\n" << viz_graph; - - size_t hash_val = std::hash<std::string>()(viz_graph); - VLOG(4) << "The graph's hash value is: " << hash_val; - return hash_val; -} - void CinnCacheKey::SetKey( const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); for (const auto& name_tensor : input_tensors) { input_shapes_[name_tensor.first] = name_tensor.second->dims(); } @@ -87,7 +64,7 @@ void CinnCacheKey::SetKey( void CinnCacheKey::SetKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, const std::string& arch_str) { - graph_serialize_str_ = std::to_string(HashGraph(graph)); + graph_hash_val_ = graph_hash_(graph); input_shapes_ = input_shapes; arch_str_ = arch_str; } @@ -97,7 +74,7 @@ bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { } bool CinnCacheKey::operator==(const CinnCacheKey& other) const { - return graph_serialize_str_ == other.graph_serialize_str_ && + return graph_hash_val_ == other.graph_hash_val_ && input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; } @@ -114,11 +91,48 @@ size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); } - ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = hash_combine(ret, key.graph_hash_val_); ret = hash_combine(ret, string_hasher(key.arch_str_)); return ret; } +size_t CinnCacheKeyByStructure::HashGraph(const ir::Graph& graph) { + // sort graph nodes by name and id. + auto compare = [](ir::Node* n1, ir::Node* n2) { + return (n1->Name() == n2->Name()) ?
(n1->id() < n2->id()) + : (n1->Name() < n2->Name()); + }; + + // graph.Nodes() returns an unordered_set; iterate over a sorted set here + // so that the same graph always produces the same hash string + std::set<ir::Node *, decltype(compare)> node_set(compare), + output_set(compare); + node_set.insert(graph.Nodes().begin(), graph.Nodes().end()); + + std::string hash_str; + for (ir::Node* n : node_set) { + hash_str.append(n->Name()); + + output_set.clear(); + output_set.insert(n->outputs.begin(), n->outputs.end()); + for (auto* out : output_set) { + hash_str.append(out->Name()); + } + } + + VLOG(1) << "The hash graph:\n" << hash_str; + + size_t hash_val = std::hash<std::string>()(hash_str); + VLOG(4) << "The graph's hash value by graph structure is: " << hash_val; + return hash_val; +} + +size_t CinnCacheKeyByAddress::HashGraph(const ir::Graph& graph) { + size_t hash_val = reinterpret_cast<size_t>(&graph); + VLOG(4) << "The graph's hash value by graph address is: " << hash_val; + return hash_val; +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h index 941f8e0cdecc18..67325297c47724 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -14,6 +14,7 @@ #pragma once +#include <functional> #include <map> #include "paddle/fluid/framework/ddim.h" @@ -33,14 +34,18 @@ namespace paddle2cinn { // shapes. class CinnCacheKey { public: + using GraphHashStrategy = std::function<size_t(const ir::Graph&)>; + + explicit CinnCacheKey(GraphHashStrategy graph_hash); + CinnCacheKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); CinnCacheKey(const ir::Graph& graph, const std::map<std::string, DDim>& input_shapes, - const std::string& arch_str); + const std::string& arch_str, GraphHashStrategy graph_hash); - ~CinnCacheKey() {} + ~CinnCacheKey() = default; void SetKey(const ir::Graph& graph, const std::map<std::string, const LoDTensor*>& input_tensors, @@ -58,13 +63,38 @@ class CinnCacheKey { }; private: - size_t HashGraph(const ir::Graph& graph); - - std::string graph_serialize_str_; + GraphHashStrategy graph_hash_; + size_t graph_hash_val_; std::map<std::string, DDim> input_shapes_; std::string arch_str_; }; +#define CINN_CACHE_KEY_CREATE(NAME) \ + class NAME : public CinnCacheKey { \ + public: \ + NAME() : CinnCacheKey(HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, const LoDTensor*>& input_tensors, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_tensors, arch_str, HashGraph) {} \ + \ + NAME(const ir::Graph& graph, \ + const std::map<std::string, DDim>& input_shapes, \ + const std::string& arch_str) \ + : CinnCacheKey(graph, input_shapes, arch_str, HashGraph) {} \ + \ + private: \ + static size_t HashGraph(const ir::Graph& graph); \ + }; + +// Class to store the keys by graph address for compiling CINN. +CINN_CACHE_KEY_CREATE(CinnCacheKeyByAddress) +// Class to store the keys by graph structure for compiling CINN.
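+// (Both key classes below are generated by the CINN_CACHE_KEY_CREATE macro +// above; they differ only in their HashGraph strategy, while equality and +// Hash behavior is inherited from CinnCacheKey.)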
+CINN_CACHE_KEY_CREATE(CinnCacheKeyByStructure) + +#undef CINN_CACHE_KEY_CREATE + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc index f13f44998211f4..f9b48ef4b5ec0c 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -26,8 +26,8 @@ namespace paddle { namespace framework { namespace paddle2cinn { -TEST(CinnCacheKeyTest, TestAsUnorderedKey) { - std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByStructure) { + std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; ProgramDesc empty_program; ir::Graph empty_graph(empty_program); @@ -47,19 +47,20 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { DDim ddim = paddle::framework::make_ddim({1, 2, 3}); std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; - CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); - CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key1(empty_graph, feed_shapes, "x86"); EXPECT_EQ(cache_key0, cache_key1); - CinnCacheKey cache_key2(graph, feed_shapes, "x86"); - CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); - CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + CinnCacheKeyByStructure cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByStructure cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByStructure cache_key4(graph, feed_tensors, "nvgpu"); EXPECT_NE(cache_key2, cache_key3); EXPECT_EQ(cache_key3, cache_key4); - CinnCacheKey cache_key5(empty_graph, - std::map<std::string, const LoDTensor*>(), "unk"); - CinnCacheKey cache_key6(empty_graph, std::map<std::string, DDim>(), "unk"); + CinnCacheKeyByStructure cache_key5( + empty_graph, std::map<std::string, const LoDTensor*>(), "unk"); + CinnCacheKeyByStructure cache_key6(empty_graph, std::map<std::string, DDim>(), + "unk"); EXPECT_EQ(cache_key5, cache_key6); EXPECT_NE(cache_key1, cache_key3); @@ -98,6 +99,107 @@ TEST(CinnCacheKeyTest, TestAsUnorderedKey) { EXPECT_EQ(test_set.find(cache_key6), test_set.end()); } +TEST(CinnCacheKeyTest, TestAsUnorderedKeyByAddress) { + std::unordered_set<CinnCacheKey, CinnCacheKey::Hash> test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map<std::string, const LoDTensor *> feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map<std::string, DDim> feed_shapes = {{"X", ddim}}; + + CinnCacheKeyByAddress cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKeyByAddress cache_key2(graph, feed_shapes, "x86"); + CinnCacheKeyByAddress cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKeyByAddress cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKeyByAddress cache_key5( + empty_graph, std::map<std::string, const LoDTensor *>(), "unk"); + CinnCacheKeyByAddress cache_key6(empty_graph, std::map<std::string, DDim>(), + "unk"); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2,
cache_key6); + + test_set.insert(cache_key0); + test_set.insert(cache_key1); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key0); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +TEST(CinnCacheKeyTest, TestSameGraph) { + ProgramDesc program1; + auto *global_block1 = program1.MutableBlock(0); + auto *x1 = global_block1->Var("X"); + x1->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph1(program1); + + ProgramDesc program2; + auto *global_block2 = program2.MutableBlock(0); + auto *x2 = global_block2->Var("X"); + x2->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph2(program2); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map<std::string, const LoDTensor *> feed_tensors = { + {"X", tensor_pointer}}; + + CinnCacheKeyByAddress cache_key_by_address1(graph1, feed_tensors, "x86"); + CinnCacheKeyByAddress cache_key_by_address2(graph2, feed_tensors, "x86"); + EXPECT_NE(cache_key_by_address1, cache_key_by_address2); + + CinnCacheKeyByStructure cache_key_by_struct1(graph1, feed_tensors, "x86"); + CinnCacheKeyByStructure cache_key_by_struct2(graph2, feed_tensors, "x86"); + EXPECT_EQ(cache_key_by_struct1, cache_key_by_struct2); +} + } // namespace paddle2cinn } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 7fc8eff3d31c9e..54167d95899d6f 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -41,6 +41,7 @@ #include "paddle/fluid/framework/rw_lock.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/inference/analysis/dot.h" +#include "paddle/fluid/operators/cinn/cinn_launch_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/string_helper.h" @@ -68,23 +69,41 @@ const CinnCompiledObject& CinnCompiler::Compile( const std::map<std::string, const LoDTensor*>& input_tensors, const Target& target, void* stream) { VLOG(1) << "-- The graph to be compiled is:\n" << VizGraph(graph); - CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + CinnCacheKeyByAddress cur_key_by_address(graph, input_tensors, + target.arch_str()); + CinnCacheKeyByStructure cur_key_by_struct; + bool exist = false; { AutoRDLock r_guard{&rwlock_}; - exist = cache_.count(cur_key) != 0; + exist = cache_by_address_.count(cur_key_by_address) != 0; + // if the graph cannot be found by address, check whether its structure + // has been stored in the cache. + if (!exist) { + // generate the structure cache key + cur_key_by_struct.SetKey(graph, input_tensors, target.arch_str()); + + // if the graph structure can be found, store the graph address in the + // cache for the next query.
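+ // (the address-keyed cache stores raw pointers owned by the + // structure-keyed cache, so later lookups of the same graph object + // take the cheaper address fast path)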
+ if (cache_by_struct_.count(cur_key_by_struct) != 0) { + exist = true; + cache_by_address_[cur_key_by_address] = + cache_by_struct_.at(cur_key_by_struct).get(); + } + } } if (!exist) { std::int64_t compiled_num = real_compiled_num_.fetch_add(1); auto compiled_res = CompileGraph(graph, input_tensors, target, compiled_num, stream); AutoWRLock w_guard{&rwlock_}; - if (!cache_.count(cur_key)) { - cache_[cur_key] = std::move(compiled_res); + if (!cache_by_struct_.count(cur_key_by_struct)) { + cache_by_address_[cur_key_by_address] = compiled_res.get(); + cache_by_struct_[cur_key_by_struct] = std::move(compiled_res); } } AutoRDLock guard{&rwlock_}; - const auto& cached_boj = *cache_[cur_key]; + const auto& cached_obj = *cache_by_address_[cur_key_by_address]; return cached_obj; } @@ -181,7 +200,8 @@ void CinnCompiler::Clear() { { AutoWRLock guard{&rwlock_}; graphs_.clear(); - cache_.clear(); + cache_by_address_.clear(); + cache_by_struct_.clear(); } real_compiled_num_.store(0); } @@ -217,6 +237,9 @@ std::unique_ptr<CinnCompiledObject> CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; + compiled_obj->launch_context = + std::make_unique<operators::details::CinnLaunchContext>( + compiled_obj->paddle2cinn_varmap, compiled_obj->scope); return compiled_obj; } diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h index 71119acf1fb49e..3bc60e55557070 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -31,6 +31,13 @@ #include "paddle/fluid/platform/macros.h" namespace paddle { + +namespace operators { +namespace details { +class CinnLaunchContext; +} +} + namespace framework { namespace paddle2cinn { @@ -39,6 +46,7 @@ struct CinnCompiledObject { std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; std::shared_ptr<::cinn::hlir::framework::Scope> scope; std::unordered_map<std::string, std::string> paddle2cinn_varmap; + std::unique_ptr<operators::details::CinnLaunchContext> launch_context; }; // Entrance to use CINN. @@ -87,9 +95,12 @@ class CinnCompiler { void* stream = nullptr) const; std::unordered_map<std::string, std::unique_ptr<ir::Graph>> graphs_; - std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>, + std::unordered_map<CinnCacheKey, CinnCompiledObject*, CinnCacheKey::Hash> - cache_; + cache_by_address_; + std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>, CinnCacheKey::Hash> + cache_by_struct_; std::atomic_int64_t real_compiled_num_{0}; mutable RWLock rwlock_; diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 55254c65fad598..9521df651f9de9 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -15,6 +15,7 @@ limitations under the License.
*/ #include #include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/core/convert_utils.h" #include "paddle/pten/core/kernel_factory.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -190,8 +191,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::string KernelSignatureToString(const KernelSignature& signature) { diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8875ef74bce14e..54f46e49c4f730 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -487,6 +487,14 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 90a4ca73399cf5..f6337a754f91ae 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -32,7 +32,30 @@ CinnLaunchContext::CinnLaunchContext( [](const auto& name_view) { return std::string(name_view.data()); }); } -bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) { +void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place) { + if (std::addressof(scope) == cached_scope_ && + std::addressof(place) == cached_place_) { + VLOG(4) << "Captured scope:" << cached_scope_ << ", place:" << cached_place_ + << " are not changed"; + return; + } + VLOG(4) << "Captured env is updated, scope:" << cached_scope_ << "->" + << std::addressof(scope) << ", place:" << cached_place_ << "->" + << std::addressof(place); + cached_scope_ = std::addressof(scope); + cached_place_ = std::addressof(place); + cached_temp_scope_ = scope.NewTmpScope(); +} + +bool CinnLaunchContext::IsArgumentsInitialized() const { + if (hold_buffers_.empty() || name2argument_.empty()) { + return false; + } + return true; +} + +bool CinnLaunchContext::IsVariableUsed(const std::string& paddle_name) const { return paddle2cinn_varmap_.count(paddle_name) > 0 && cinn_variable_names_.count(paddle2cinn_varmap_.at(paddle_name)) > 0; } @@ -67,85 +90,83 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& paddle_name, // TODO(CtfGo): check the underlying data type after CINN ready } -void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { +void CinnLaunchContext::AssignExternalVariable(const std::string& paddle_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(paddle_name), true, platform::errors::InvalidArgument( "Paddle variable(%s) not used by cinn", paddle_name)); const auto& cinn_name = paddle2cinn_varmap_.at(paddle_name); + const auto& paddle_tensor = + cached_scope_->GetVar(paddle_name)->Get<LoDTensor>(); CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { -
paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); + if (paddle_tensor.IsInitialized()) { + CheckTensorEquivalent(paddle_name, paddle_tensor, cinn_tensor); } - CheckTensorEquivalent(paddle_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ false, - paddle_tensor); + + auto cinn_buffer = std::make_unique<cinn_buffer_t>(); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>( + [this, paddle_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_scope_->GetVar(paddle_name)->GetMutable<LoDTensor>(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast<uint8_t*>( + tensor->mutable_data<float>(*cached_place_)); + return 0; + }); + + // external variables will be recycled by global gc, so do nothing here + cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( + [](void* ctx, cinn_buffer_t* buffer) { + // Do nothing + return 0; + }); + + return SetArgument(cinn_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name, - const platform::Place& place, - LoDTensor* paddle_tensor) { +void CinnLaunchContext::AssignInternalVariable(const std::string& cinn_name) { PADDLE_ENFORCE_GT(cinn_variable_names_.count(cinn_name), 0, platform::errors::InvalidArgument( "Variable(%s) not found in cinn scope.", cinn_name)); CinnTensor cinn_tensor = GetCinnTensor(cinn_name); - if (!paddle_tensor->IsInitialized()) { - paddle_tensor->Resize(framework::make_ddim(cinn_tensor->shape().data())); - } - CheckTensorEquivalent(cinn_name, *paddle_tensor, cinn_tensor); - return SetArgument(cinn_name, place, /* free_mem_callback = */ true, - paddle_tensor); -} - -std::unique_ptr<cinn_buffer_t> CinnLaunchContext::ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor) { - // convert paddle dimensions array to cinn format - std::vector<cinn_dimension_t> cinn_dims(tensor->dims().size()); - for (auto i = 0; i < tensor->dims().size(); ++i) { - cinn_dims[i] = static_cast<cinn_dimension_t>(tensor->dims().at(i)); - } - auto cinn_buffer = std::make_unique<cinn_buffer_t>(); - // assign size and memory - cinn_buffer->resize(cinn_dims.data(), cinn_dims.size()); + // assign dimensions and alloc/free callback of cinn_buffer_t + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); cinn_buffer->external_malloc = new std::function<int(void*, cinn_buffer_t*)>( - [place, tensor](void* ctx, cinn_buffer_t* buffer) { - buffer->memory = - reinterpret_cast<uint8_t*>(tensor->mutable_data<float>(place)); + [this, cinn_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->Var(cinn_name)->GetMutable<LoDTensor>(); + tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); + buffer->memory = reinterpret_cast<uint8_t*>( + tensor->mutable_data<float>(*cached_place_)); return 0; }); - if (free_mem_callback) { - cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( - [tensor](void* ctx, cinn_buffer_t* buffer) { - tensor->clear(); - return 0; - }); - return cinn_buffer; - } - + // internal variables should release their buffers immediately + // once no instruction uses them cinn_buffer->external_free = new std::function<int(void*, cinn_buffer_t*)>( - [](void* ctx, cinn_buffer_t* buffer) { - // Do nothing + [this, cinn_name](void* ctx, cinn_buffer_t* buffer) { + auto* tensor = + cached_temp_scope_->GetVar(cinn_name)->GetMutable<LoDTensor>(); + tensor->clear(); return 0; }); - return cinn_buffer; + return SetArgument(cinn_name,
std::move(cinn_buffer)); } void CinnLaunchContext::SetArgument(const std::string& cinn_name, - const platform::Place& place, - bool free_mem_callback, - LoDTensor* paddle_tensor) { - auto buffer = - ShareTensorWithCinnBuffer(place, free_mem_callback, paddle_tensor); + std::unique_ptr<cinn_buffer_t>&& buffer) { + VLOG(4) << "SetArgument-" << name2argument_.size() << ": name(" << cinn_name + << "), dims(" << framework::DDim(buffer->dims, buffer->dimensions) + << ")."; + name2argument_.emplace(cinn_name, buffer.get()); hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << "SetArgument-" << name2argument_.size() << ": " - << "name(" << cinn_name << "), dims(" << paddle_tensor->dims() - << ")."; } const std::map<std::string, cinn_pod_value_t>& diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index c990255d68253d..7bf70c9b9d0227 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -24,7 +24,7 @@ #include "cinn/runtime/cinn_runtime.h" #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/scope.h" namespace paddle { namespace operators { @@ -40,16 +40,22 @@ class CinnLaunchContext { const std::unordered_map<std::string, std::string>& paddle2cinn_varmap, const std::shared_ptr<CinnScope>& cinn_scope); + // Explicitly update the environment variables captured + // by the callbacks of the execution arguments + void UpdateCapturedEnv(const framework::Scope& scope, + const platform::Place& place); + + // Return whether the execution arguments have been initialized + bool IsArgumentsInitialized() const; + // Return whether a Paddle variable is used by the compiled kernels - bool IsVariableUsed(const std::string& var_name); + bool IsVariableUsed(const std::string& paddle_name) const; // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignExternalVariable(const std::string& paddle_name); // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name, - const platform::Place& place, LoDTensor* tensor); + void AssignInternalVariable(const std::string& cinn_name); // Extract internal variable names from CinnScope // by excluding used input and output variables @@ -58,10 +64,6 @@ class CinnLaunchContext { // Finalize all execution arguments and return them const std::map<std::string, cinn_pod_value_t>& FinalizeArguments() const; - std::vector<std::unique_ptr<cinn_buffer_t>> HandoverBuffers() { - return std::move(hold_buffers_); - } - private: // Get CinnTensor with CINN variable name CinnTensor GetCinnTensor(const std::string& var_name); @@ -72,16 +74,15 @@ class CinnLaunchContext { const LoDTensor& paddle_tensor, const CinnTensor& cinn_tensor); - // Share the buffer of a Paddle tensor to CINN by delivering memory address - // to a cinn_buffer_t object - std::unique_ptr<cinn_buffer_t> ShareTensorWithCinnBuffer( - const platform::Place& place, bool free_mem_callback, LoDTensor* tensor); - - // Set an argument with (cinn name)->(paddle tensor) pair - void SetArgument(const std::string& cinn_name, const platform::Place& place, - bool free_mem_callback, LoDTensor* paddle_tensor); + // Set an argument with (cinn name)->(cinn_buffer_t) pair + void SetArgument(const std::string& cinn_name, + std::unique_ptr<cinn_buffer_t>&& buffer); private: + const framework::Scope* cached_scope_ = nullptr; + const platform::Place* cached_place_ = nullptr; + std::unique_ptr<framework::Scope> cached_temp_scope_ = nullptr; + + // a
variable name map from paddle to cinn const std::unordered_map<std::string, std::string>& paddle2cinn_varmap_; // the variable scope of cinn diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index d922e8355b44c5..da7640c3c0f682 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -45,81 +45,86 @@ std::unique_ptr<CinnLaunchContext> CreateDefaultLaunchContext() { return std::make_unique<CinnLaunchContext>(paddle2cinn_varmap, cinn_scope); } -TEST(CinnLaunchContextTest, TestIsVariableUsed) { +TEST(CinnLaunchContextTest, TestBasic) { auto launch_context = CreateDefaultLaunchContext(); - + // test IsVariableUsed ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); -} - -TEST(CinnLaunchContextTest, TestGetInternalVariableNames) { - auto launch_context = CreateDefaultLaunchContext(); - auto internal_variable_names = launch_context->GetInternalVariableNames(); - ASSERT_EQ(internal_variable_names.size(), 3); - EXPECT_NE(internal_variable_names.find("cinn_var2"), - internal_variable_names.end()); + // test UpdateCapturedEnv + platform::CPUPlace place; + framework::Scope scope; + ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); + // test IsArgumentsInitialized + ASSERT_FALSE(launch_context->IsArgumentsInitialized()); } TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data<float>(framework::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1", place, tensor1), + ASSERT_THROW(launch_context->AssignExternalVariable("var1"), paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { - auto launch_context = CreateDefaultLaunchContext(); platform::CPUPlace place; framework::Scope scope; + auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); auto* tensor4 = scope.Var("var4")->GetMutable<LoDTensor>(); // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4", place, tensor4), + ASSERT_THROW(launch_context->AssignExternalVariable("var4"), paddle::platform::EnforceNotMet); // not found - ASSERT_THROW( - launch_context->AssignExternalVariable("cinn_var4", place, tensor4), - paddle::platform::EnforceNotMet); + ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), + paddle::platform::EnforceNotMet); } TEST(CinnLaunchContextTest, TestSetArgument) { + platform::CPUPlace cpu_place; + platform::Place place(cpu_place); + framework::Scope scope; auto launch_context = CreateDefaultLaunchContext(); + launch_context->UpdateCapturedEnv(scope, place); - platform::CPUPlace place; - framework::Scope scope; + // assign external variables auto* tensor1 = scope.Var("var1")->GetMutable<LoDTensor>(); float* data1 = tensor1->mutable_data<float>(framework::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - // assign external variable - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var1", place, tensor1)); - auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>(); - tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place); -
ASSERT_NO_THROW( - launch_context->AssignInternalVariable("cinn_var2", place, tensor2)); - // FinalizeArguments not missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); auto* tensor3 = scope.Var("var3")->GetMutable<LoDTensor>(); tensor3->mutable_data<float>(framework::make_ddim({10, 16}), place); - ASSERT_NO_THROW( - launch_context->AssignExternalVariable("var3", place, tensor3)); + ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); + + // FinalizeArguments should throw while internal variables are still missing + ASSERT_THROW(launch_context->FinalizeArguments(), + paddle::platform::EnforceNotMet); + // test get internal variables + auto internal_variable_names = launch_context->GetInternalVariableNames(); + ASSERT_EQ(internal_variable_names.size(), 1); + EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); + auto* tensor2 = scope.Var("var2")->GetMutable<LoDTensor>(); + tensor2->mutable_data<float>(framework::make_ddim({6, 7, 8}), place); + ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); + + // check argument is set correctly and alloc/free callbacks work well auto name2argument = launch_context->FinalizeArguments(); ASSERT_EQ(name2argument.size(), 3); ASSERT_EQ(name2argument.count("cinn_var1"), 1); - // check ShareTensorWithCinnBuffer + ASSERT_TRUE(launch_context->IsArgumentsInitialized()); + auto* cinn_buffer = static_cast<cinn_buffer_t*>(name2argument.at("cinn_var1")); - ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc index 813e7b1152f87e..ea36a19202ef06 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cu.cc @@ -31,26 +31,6 @@ namespace operators { namespace details { #ifdef PADDLE_WITH_CUDA -void CUDART_CB ReleaseScope(void* data) { - auto* temp_scope = static_cast<framework::Scope*>(data); - delete temp_scope; -} - -void CUDART_CB ReleaseBuffers(void* data) { - auto* buffers = - static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(data); - delete buffers; -} - -template <> -void ReleaseResource<platform::CUDADeviceContext>( - const std::vector<void*>& resources, void* stream) { - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast<cudaStream_t>(stream), ReleaseScope, resources[0])); - PADDLE_ENFORCE_GPU_SUCCESS(cudaLaunchHostFunc( - static_cast<cudaStream_t>(stream), ReleaseBuffers, resources[1])); -} - template <> void* GetStream<platform::CUDADeviceContext>( const framework::ExecutionContext& ctx) { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 3a272916332bea..8a5ca166941df0 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -56,25 +56,12 @@ void LaunchCinnExecution(const CinnCompiledObject& compiled_obj, // Set cinn FLAGS (such as FLAGS_cinn_cudnn_deterministic) with paddle's FLAGS.
void SetCinnRuntimeFlags(); -template <typename DeviceContext> -void ReleaseResource(const std::vector<void*>& resources, void* stream) { - auto* temp_scope = static_cast<framework::Scope*>(resources[0]); - auto* buffers = - static_cast<std::vector<std::unique_ptr<cinn_buffer_t>>*>(resources[1]); - delete temp_scope; - delete buffers; -} - template <typename DeviceContext> void* GetStream(const framework::ExecutionContext& ctx) { return nullptr; } #ifdef PADDLE_WITH_CUDA -template <> -void ReleaseResource<platform::CUDADeviceContext>( - const std::vector<void*>& resources, void* stream); - template <> void* GetStream<platform::CUDADeviceContext>( const framework::ExecutionContext& ctx); @@ -116,56 +103,54 @@ class CinnLaunchOpKernel : public framework::OpKernel { compilation_key, inputs_name2tensor, target, stream); details::DebugCinnCompiledResult(cinn_compiled_object); - auto launch_context = std::make_unique<details::CinnLaunchContext>( - cinn_compiled_object.paddle2cinn_varmap, cinn_compiled_object.scope); - + const auto& launch_context = cinn_compiled_object.launch_context; // Step 3. Prepare arguments needed for the compiled executable program. - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. - for (const auto& var_name : input_variable_names) { - if (!launch_context->IsVariableUsed(var_name)) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; - continue; + launch_context->UpdateCapturedEnv(scope, place); + if (!launch_context->IsArgumentsInitialized()) { + VLOG(4) << "CinnLaunchOp prepare arguments"; + + // 3.1 Prepare input variables: tensors of input variables have + // been initialized before the graph is compiled, so just check the + // equality between the tensors of paddle and cinn. + for (const auto& var_name : input_variable_names) { + if (!launch_context->IsVariableUsed(var_name)) { + // some input variables are not needed by cinn because they are + // eliminated by optimization passes or some cinn operators use + // fewer variables + VLOG(4) << "Input variable(" << var_name << ") not used by cinn"; + continue; + } + + launch_context->AssignExternalVariable(var_name); } - launch_context->AssignExternalVariable( - var_name, place, scope.GetVar(var_name)->GetMutable<LoDTensor>()); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ(launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - auto* tensor = scope.GetVar(var_name)->GetMutable<LoDTensor>(); - launch_context->AssignExternalVariable(var_name, place, tensor); - } + // 3.2 Prepare output variables: all output variables should + // be initialized and have buffers allocated before + // the runtime program starts execution; the compilation result + // includes details of their buffer assignment and we use that to + // allocate space in Paddle.
For those variables already allocated, + // like persistable parameters, just check the equality between + // the Paddle allocation and the CINN buffer assignment. + auto output_variable_names = ctx.OutputNames(kOutputs); + for (const auto& var_name : output_variable_names) { + PADDLE_ENFORCE_EQ( + launch_context->IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Output variable(%s) not used by cinn", var_name)); + + launch_context->AssignExternalVariable(var_name); + } - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->GetInternalVariableNames(); - framework::Scope* temp_scope = scope.NewTmpScope().release(); - for (const auto& var_name : internal_variable_names) { - auto* tensor = temp_scope->Var(var_name)->GetMutable<LoDTensor>(); - launch_context->AssignInternalVariable(var_name, place, tensor); + // 3.3 Prepare internal or temporary variables: Create a temporary + // scope to keep the internal variables within the graph, as well as + // the temporary variables needed by the compiled runtime program. + // Here we directly use the names from CinnScope as Paddle variable + // names, because they will not be used outside the graph + // and should be destructed after the computation finishes. + auto internal_variable_names = launch_context->GetInternalVariableNames(); + for (const auto& var_name : internal_variable_names) { + launch_context->AssignInternalVariable(var_name); + } } // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. @@ -175,12 +160,6 @@ class CinnLaunchOpKernel : public framework::OpKernel { VLOG(4) << "Run Cinn compiled executable program with stream: " << stream; details::LaunchCinnExecution(cinn_compiled_object, *launch_context, stream); VLOG(4) << "CinnLaunchOp launch execution done."; - - // Step 6. Release some resources, such as `temp_scope` and cinn_buffers.
- auto* buffers_holder = new std::vector<std::unique_ptr<cinn_buffer_t>>{ - launch_context->HandoverBuffers()}; - details::ReleaseResource<DeviceContext>({temp_scope, buffers_holder}, - stream); } }; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc index 02373c38184fca..e10fdf522ff7c5 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op_test.cc @@ -130,8 +130,9 @@ TEST(CinnLaunchOpTest, TestElementwiseAddPass) { scope.Var(test_out_name)->GetMutable<LoDTensor>(); scope.Var(expected_out_name)->GetMutable<LoDTensor>(); - cinn_launch_op->Run(scope, place); - elementwise_add_op->Run(scope, place); + platform::Place run_place(place); + cinn_launch_op->Run(scope, run_place); + elementwise_add_op->Run(scope, run_place); LoDTensor test_out, expected_out; TensorCopySync(scope.Var(test_out_name)->Get<LoDTensor>(), diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ed06fac298a8fd..155eb1ebbe3db3 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -555,10 +555,10 @@ class Reshape2Op : public ReshapeOp { const framework::ExecutionContext &ctx) const override { auto multi_inputs = ctx.MultiInput<framework::Tensor>("ShapeTensor"); if (multi_inputs.size() > 0) { - return framework::KernelSignature("reshape.mulhost", {"X", "ShapeTensor"}, + return framework::KernelSignature("reshape_mulhost", {"X", "ShapeTensor"}, {}, {"Out"}); } else if (ctx.HasInput("Shape")) { - return framework::KernelSignature("reshape.host", {"X", "Shape"}, {}, + return framework::KernelSignature("reshape_host", {"X", "Shape"}, {}, {"Out"}); } else { return framework::KernelSignature("reshape", {"X"}, {"shape"}, {"Out"}); diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index d443e78ed874f3..3e070da546b2ae 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -101,6 +101,20 @@ inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16); } +#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 +static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { + return *reinterpret_cast<float16 *>(&x); +} + +static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { + return *reinterpret_cast<__half *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + return CUDAFP16ToPDFP16( + atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); +} +#else CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exist in the lower or higher 16 bits // of the 32-bit address.
@@ -133,6 +147,7 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif +#endif CUDA_ATOMIC_WRAPPER(Add, complex<float>) { float *real = reinterpret_cast<float *>(address); diff --git a/paddle/fluid/platform/device/ipu/CMakeLists.txt b/paddle/fluid/platform/device/ipu/CMakeLists.txt index 25629ba74d9152..9be12cbf6d4376 100644 --- a/paddle/fluid/platform/device/ipu/CMakeLists.txt +++ b/paddle/fluid/platform/device/ipu/CMakeLists.txt @@ -1,5 +1,5 @@ -# IPU IF(WITH_IPU) + FILE(GLOB POPART_CANONICALIZATION_SRC ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/device/ipu/popart_canonicalization/*.cc) cc_library(ipu_device SRCS device.cc DEPS enforce popart) cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart) cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce) diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc new file mode 100644 index 00000000000000..5793c4c0e3ca69 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/activation_ops.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *activation_op_handler(Graph *graph, Node *node, const std::string &type) { + auto new_node = CreateBaseOp(graph, node, type, {GetInputVarNode("X", node)}, + node->outputs); + return new_node; +} + +Node *relu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_relu"); +} + +Node *tanh_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_tanh"); +} + +Node *log_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_log"); +} + +Node *sigmoid_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sigmoid"); +} + +Node *sqrt_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_sqrt"); +} + +Node *gelu_handler(Graph *graph, Node *node) { + return activation_op_handler(graph, node, "popart_gelu_v2"); +} + +Node *log_softmax_handler(Graph *graph, Node *node) { + auto axis = BOOST_GET_CONST(int, node->Op()->GetAttr("axis")); + auto new_softmax = CreateSoftmaxOpset11(graph, node, node->inputs, {}, axis); + return CreateBaseOp(graph, node, "popart_log", new_softmax->outputs, + node->outputs); +} + +REGISTER_HANDLER(relu, relu_handler); +REGISTER_HANDLER(tanh, tanh_handler); +REGISTER_HANDLER(log, log_handler); +REGISTER_HANDLER(sigmoid, sigmoid_handler); +REGISTER_HANDLER(sqrt, sqrt_handler); +REGISTER_HANDLER(gelu, gelu_handler); +REGISTER_HANDLER(log_softmax, log_softmax_handler); + +} // namespace
+} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc new file mode 100644 index 00000000000000..d46fc55ec6ce0d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +// This avoids the static initialisation order fiasco. +std::unordered_map<std::string, SymbolHandler> &SymbolHandlers() { + static std::unordered_map<std::string, SymbolHandler> symbol_handlers; + return symbol_handlers; +} + +bool RegisterHandler(const std::string &symbol, const SymbolHandler &handler) { + if (SymbolHandlers().count(symbol) != 0) { + LOG(WARNING) << "Trying to register popart handler twice for operator: " + << symbol; + return false; + } + bool new_handler = SymbolHandlers().emplace(symbol, handler).second; + return new_handler; +} + +// Return the handler if one is registered for this kind of node, or +// an empty std::function otherwise.
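+// (Handlers are added to SymbolHandlers() during static initialisation +// via the REGISTER_HANDLER macro declared in canonicalization_utils.h.)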
+SymbolHandler GetHandler(const std::string &kind) { + auto it = SymbolHandlers().find(kind); + if (it != SymbolHandlers().end()) { + return it->second; + } + return {}; +} + +void ConnectNodes(Node *first_node, Node *next_node) { + first_node->outputs.push_back(next_node); + next_node->inputs.push_back(first_node); +} + +void DisConnectNodes(Node *first_node, Node *next_node) { + auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + rm_by_value(first_node->outputs, next_node); + rm_by_value(next_node->inputs, first_node); + rm_by_value(first_node->inputs, next_node); + rm_by_value(next_node->outputs, first_node); +} + +void ClearNode(Node *node) { + auto rm_by_value = [&](std::vector<Node *> &vec, Node *n) { + vec.erase(std::remove(vec.begin(), vec.end(), n), vec.end()); + }; + for (auto *node_in : node->inputs) { + rm_by_value(node_in->outputs, node); + } + for (auto *node_out : node->outputs) { + rm_by_value(node_out->inputs, node); + } +} + +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override) { + if (new_op->HasAttr(attr_name) && !override) { + return; + } + if (op->HasAttr(attr_name)) { + VLOG(10) << "Copying attr: " << attr_name << " from " << op->Type() + << " to " << new_op->Type(); + new_op->SetAttr(attr_name, op->GetAttr(attr_name)); + new_op->Flush(); + } +} + +const int VarType2OnnxDtype(const int type) { + auto dtype = static_cast<framework::proto::VarType::Type>(type); + switch (dtype) { + case framework::proto::VarType::BOOL: + return static_cast<int>(ONNXDataType::BOOL); + case framework::proto::VarType::INT16: + return static_cast<int>(ONNXDataType::INT16); + case framework::proto::VarType::INT32: + return static_cast<int>(ONNXDataType::INT32); + case framework::proto::VarType::INT64: + return static_cast<int>(ONNXDataType::INT64); + case framework::proto::VarType::FP16: + return static_cast<int>(ONNXDataType::FLOAT16); + case framework::proto::VarType::FP32: + return static_cast<int>(ONNXDataType::FLOAT); + case framework::proto::VarType::FP64: + return static_cast<int>(ONNXDataType::DOUBLE); + case framework::proto::VarType::UINT8: + return static_cast<int>(ONNXDataType::UINT8); + case framework::proto::VarType::INT8: + return static_cast<int>(ONNXDataType::INT8); + case framework::proto::VarType::BF16: + return static_cast<int>(ONNXDataType::BFLOAT16); + case framework::proto::VarType::COMPLEX64: + return static_cast<int>(ONNXDataType::COMPLEX64); + case framework::proto::VarType::COMPLEX128: + return static_cast<int>(ONNXDataType::COMPLEX128); + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported data type: %d.", dtype)); + } +} + +const std::string VarType2PopStr(const int type) { + auto dtype = static_cast<framework::proto::VarType::Type>(type); + switch (dtype) { + case framework::proto::VarType::UINT8: + return "UINT8"; + case framework::proto::VarType::INT8: + return "INT8"; + case framework::proto::VarType::INT16: + return "INT16"; + case framework::proto::VarType::INT32: + return "INT32"; + case framework::proto::VarType::INT64: + return "INT64"; + case framework::proto::VarType::BOOL: + return "BOOL"; + case framework::proto::VarType::FP64: + return "DOUBLE"; + case framework::proto::VarType::FP32: + return "FLOAT"; + case framework::proto::VarType::FP16: + return "FLOAT16"; + default: + PADDLE_THROW( + paddle::platform::errors::Unavailable("Unsupported data type.")); + } +} + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Input(input_name).at(id); + return
GetInputVarNodeByVarName(var_name, op_node); +} + +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id) { + auto var_name = op_node->Op()->Output(output_name).at(id); + return GetOutputVarNodeByVarName(var_name, op_node); +} + +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->inputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node) { + for (auto *var : op_node->outputs) { + if (var->Name() == var_name) { + return var; + } + } + return nullptr; +} + +const bool is_float_equal(float a, float b, float eps) { + return std::fabs(a - b) <= eps; +} + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h new file mode 100644 index 00000000000000..c1b2bd0c8b5fd4 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/platform/device/ipu/ipu_utils.h" + +namespace paddle { +namespace platform { +namespace ipu { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::OpDesc; + +#define REGISTER_HANDLER(name, func) \ + static bool __UNUSED_##name = \ + paddle::platform::ipu::RegisterHandler(#name, func) + +using SymbolHandler = std::function<Node *(Graph *, Node *)>; + +std::unordered_map<std::string, SymbolHandler> &SymbolHandlers(); + +bool RegisterHandler(const std::string &, const SymbolHandler &); + +SymbolHandler GetHandler(const std::string &); + +void ConnectNodes(Node *first_node, Node *next_node); +void DisConnectNodes(Node *first_node, Node *next_node); +void ClearNode(Node *node); +void CopyOpAttr(const std::string &attr_name, OpDesc *op, OpDesc *new_op, + bool override = false); + +const int VarType2OnnxDtype(const int type); +const std::string VarType2PopStr(const int type); + +Node *GetInputVarNode(const std::string &input_name, const Node *op_node, + const int id = 0); +Node *GetOutputVarNode(const std::string &output_name, const Node *op_node, + const int id = 0); +Node *GetInputVarNodeByVarName(const std::string &var_name, + const Node *op_node); +Node *GetOutputVarNodeByVarName(const std::string &var_name, + const Node *op_node); + +const bool is_float_equal(float a, float b, float eps = 1e-8); + +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc new file mode 100644 index 00000000000000..f0c19cac3a6c3f --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/elementwise_ops.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *elementwise_op_handler(Graph *graph, Node *node, + const std::string &type) { + auto *op = node->Op(); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + int64_t x_rank = x_shape.size(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + int64_t y_rank = y_shape.size(); + + auto axis = BOOST_GET_CONST(int, op->GetAttr("axis")); + if (axis == -1 || axis == x_rank - 1 || x_rank == y_rank) { + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs); + return new_node; + } else { + auto y_new_shape = std::vector<int64_t>(x_rank, 1); + for (int i = axis; i < axis + y_rank; ++i) { + y_new_shape[i] = y_shape[i - axis]; + } + auto attrs = AttributeMap{ + {"value", y_new_shape}, + {"dims", std::vector<int64_t>{x_rank}}, + {"dtype", ONNXDataType::INT64}, + }; + // constant + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + // reshape + auto new_node_reshape = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("Y", node), new_node_const->outputs[0]}, {}); + // elementwise_op + auto new_node = + CreateBaseOp(graph, node, type, + {GetInputVarNode("X", node), new_node_reshape->outputs[0]}, + node->outputs); + return new_node; + } +} + +Node *elementwise_add_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_add"); +} + +Node *elementwise_sub_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_sub"); +} + +Node *elementwise_div_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_div"); +} + +Node *elementwise_mul_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mul"); +} + +Node *elementwise_min_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_min"); +} + +Node *elementwise_max_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_max"); +} + +Node *elementwise_pow_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_pow"); +} + +Node *elementwise_mod_handler(Graph *graph, Node *node) { + return elementwise_op_handler(graph, node, "popart_mod"); +} + +REGISTER_HANDLER(elementwise_add, elementwise_add_handler); +REGISTER_HANDLER(elementwise_sub, elementwise_sub_handler); +REGISTER_HANDLER(elementwise_div, elementwise_div_handler); +REGISTER_HANDLER(elementwise_mul, elementwise_mul_handler); +REGISTER_HANDLER(elementwise_min, elementwise_min_handler); +REGISTER_HANDLER(elementwise_max, elementwise_max_handler); +REGISTER_HANDLER(elementwise_pow, elementwise_pow_handler); +REGISTER_HANDLER(elementwise_mod, elementwise_mod_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc new file mode 100644 index 00000000000000..92362ebf5be7d5 --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/logic_ops.cc @@ -0,0 +1,36 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *equal_handler(Graph *graph, Node *node) { + auto new_node = CreateBaseOp( + graph, node, "popart_equal", + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, node->outputs); + return new_node; +} + +REGISTER_HANDLER(equal, equal_handler); + +} // namespace +} // namespace ipu +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc new file mode 100644 index 00000000000000..af7e4d0c7dbe9d --- /dev/null +++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/math_ops.cc @@ -0,0 +1,259 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *mean_handler(Graph *graph, Node *node) { + return CreateBaseOp(graph, node, "popart_reducemean", + {GetInputVarNode("X", node)}, + {GetOutputVarNode("Out", node)}, + { + {"keepdims", int64_t{0}}, + }); +} + +Node *pow_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + if (op->HasInput("FactorTensor") && !op->Input("FactorTensor").empty()) { + return CreateBaseOp( + graph, node, "popart_pow", + {GetInputVarNode("X", node), GetInputVarNode("FactorTensor", node)}, + node->outputs); + } else { + // Op(pow) -> Op(Constant)->Var(const_out)->Op(Pow) + auto value_ = BOOST_GET_CONST(float, op->GetAttr("factor")); + auto attrs = + MakeConstAttrMapFromValue<float>(value_, {1}, ONNXDataType::FLOAT); + auto new_node_const = CreateConst(graph, node, {}, {}, attrs); + return CreateBaseOp(graph, node, "popart_pow", {GetInputVarNode("X", node), + new_node_const->outputs[0]}, + node->outputs); + } +} + +Node *mul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto x_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("x_num_col_dims")); + auto y_num_col_dims = BOOST_GET_CONST(int, op->GetAttr("y_num_col_dims")); + auto x_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape_ = GetInputVarNode("Y", node)->Var()->GetShape(); + + // build the shape for reshape + std::vector<int64_t> reshape_shape_{}; + for (int left = 0; left < x_num_col_dims; left++) { + reshape_shape_.push_back(int64_t(x_shape_[left])); + } + for (int right = y_num_col_dims; right < y_shape_.size(); right++) { + reshape_shape_.push_back(int64_t(y_shape_[right])); + } + auto x_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("X", node)}, + {}, {{"axis", int64_t(x_num_col_dims)}}); + auto y_flatten = + CreateBaseOp(graph, node, "popart_flatten", {GetInputVarNode("Y", node)}, + {}, {{"axis", int64_t(y_num_col_dims)}}); + auto matmul = + CreateBaseOp(graph, node, "popart_matmul", + {x_flatten->outputs[0], y_flatten->outputs[0]}, {}, {}); + + auto reshape_const = CreateConst( + graph, node, {}, {}, + {{"value", reshape_shape_}, + {"dims", std::vector<int64_t>{int64_t(reshape_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}); + return CreateBaseOp(graph, node, "popart_reshape", + {matmul->outputs[0], reshape_const->outputs[0]}, + node->outputs, {}); +} + +Node *matmul_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto transpose_x = BOOST_GET_CONST(bool, op->GetAttr("transpose_X")); + auto transpose_y = BOOST_GET_CONST(bool, op->GetAttr("transpose_Y")); + auto alpha = BOOST_GET_CONST(float, op->GetAttr("alpha")); + auto x_shape = GetInputVarNode("X", node)->Var()->GetShape(); + auto y_shape = GetInputVarNode("Y", node)->Var()->GetShape(); + + int x_rank = x_shape.size(); + std::vector<int64_t> perm; + if (x_rank == 1) { + perm = std::vector<int64_t>{0}; + } else if (x_rank == 2) { + return CreateGemm(graph, node, + {GetInputVarNode("X", node), GetInputVarNode("Y", node)}, + node->outputs, transpose_x, transpose_y, alpha); + } else if (x_rank == 3) { + perm = std::vector<int64_t>{0, 2, 1}; + } else if (x_rank == 4) { + perm = std::vector<int64_t>{0, 1, 3, 2}; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op matmul with input rank == %d", x_rank)); + } + + Node *x_node = GetInputVarNode("X", node); + Node
+  Node *y_node = GetInputVarNode("Y", node);
+  if (transpose_x) {
+    x_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("X", node)}, {}, {{"perm", perm}});
+    x_node = x_node->outputs[0];
+  }
+  if (transpose_y) {
+    y_node = CreateBaseOp(graph, node, "popart_transpose",
+                          {GetInputVarNode("Y", node)}, {}, {{"perm", perm}});
+    y_node = y_node->outputs[0];
+  }
+  // only insert an explicit scaling op when alpha != 1.0
+  if (!is_float_equal(alpha, 1.0)) {
+    auto o_node =
+        CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node}, {});
+    auto attr =
+        MakeConstAttrMapFromValue<float>(alpha, {1}, ONNXDataType::FLOAT);
+    auto const_node = CreateConst(graph, node, {}, {}, attr);
+    return CreateBaseOp(graph, node, "popart_mul",
+                        {o_node->outputs[0], const_node->outputs[0]},
+                        node->outputs);
+  } else {
+    return CreateBaseOp(graph, node, "popart_matmul", {x_node, y_node},
+                        node->outputs);
+  }
+}
+
+Node *sum_handler(Graph *graph, Node *node) {
+  return CreateBaseOp(graph, node, "popart_sum", node->inputs, node->outputs);
+}
+
+Node *softmax_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto axis = BOOST_GET_CONST(int, op->GetAttr("axis"));
+  return CreateSoftmaxOpset11(graph, node, node->inputs, node->outputs, axis);
+}
+
+Node *scale_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto scale_ = BOOST_GET_CONST(float, op->GetAttr("scale"));
+  auto bias_ = BOOST_GET_CONST(float, op->GetAttr("bias"));
+  auto bias_after_scale_ =
+      BOOST_GET_CONST(bool, op->GetAttr("bias_after_scale"));
+  auto data_type_ = GetInputVarNode("X", node)->Var()->GetDataType();
+
+  auto new_node_bias_var =
+      CreateConst(graph, node, {}, {},
+                  {{"value", std::vector<float>{bias_}},
+                   {"dims", std::vector<int64_t>{1}},
+                   {"dtype", ONNXDataType::FLOAT}});
+  new_node_bias_var = new_node_bias_var->outputs[0];
+
+  Node *new_node_scale_var = nullptr;
+  if (op->HasInput("ScaleTensor") && !op->Input("ScaleTensor").empty()) {
+    new_node_scale_var = GetInputVarNode("ScaleTensor", node);
+  } else {
+    new_node_scale_var =
+        CreateConst(graph, node, {}, {},
+                    {{"value", std::vector<float>{scale_}},
+                     {"dims", std::vector<int64_t>{1}},
+                     {"dtype", ONNXDataType::FLOAT}});
+    new_node_scale_var = new_node_scale_var->outputs[0];
+  }
+
+  // compute in float32, then cast back to the input data type at the end
+  auto new_node_cast =
+      CreateCast(graph, node, {GetInputVarNode("X", node)}, {},
+                 static_cast<int>(framework::proto::VarType::FP32));
+  Node *result = nullptr;
+  if (bias_after_scale_) {
+    auto new_node_mul =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_cast->outputs[0], new_node_scale_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_mul->outputs[0], new_node_bias_var}, {}, {});
+  } else {
+    auto new_node_add =
+        CreateBaseOp(graph, node, "popart_add",
+                     {new_node_cast->outputs[0], new_node_bias_var}, {}, {});
+    result =
+        CreateBaseOp(graph, node, "popart_mul",
+                     {new_node_add->outputs[0], new_node_scale_var}, {}, {});
+  }
+  auto result_after_cast =
+      CreateCast(graph, node, result->outputs, node->outputs,
+                 static_cast<int>(data_type_));
+  return result_after_cast;
+}
+
+Node *cross_entropy2_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto ignoreIndex = BOOST_GET_CONST(int, op->GetAttr("ignore_index"));
+  auto new_cast = CreateCast(graph, node, {GetInputVarNode("Label", node)}, {},
+                             framework::proto::VarType::INT32);
+  auto label_shape_ = GetInputVarNode("Label", node)->Var()->GetShape();
+  if (label_shape_.size() == 1) {
+    return CreateBaseOp(graph, node, "popart_nllloss",
+                        {GetInputVarNode("X", node), new_cast->outputs[0]},
+                        {GetOutputVarNode("Y", node)},
+                        {
+                            {"ignoreIndex", ignoreIndex},
+                        });
+  } else {
+    std::vector<int64_t> new_shape_{label_shape_[0]};
+    auto const_before_loss = CreateBaseOp(
+        graph, node, "popart_constant", {}, {},
+        {{"value", new_shape_},
+         {"dims",
+          std::vector<int64_t>{static_cast<int64_t>(new_shape_.size())}},
+         {"dtype", ONNXDataType::INT64}});
+
+    auto reshape_before_loss = CreateBaseOp(
+        graph, node, "popart_reshape",
+        {new_cast->outputs[0], const_before_loss->outputs[0]}, {}, {});
+
+    auto nllloss = CreateBaseOp(
+        graph, node, "popart_nllloss",
+        {GetInputVarNode("X", node), reshape_before_loss->outputs[0]}, {},
+        {
+            {"ignoreIndex", ignoreIndex},
+        });
+
+    auto const_after_loss = CreateBaseOp(
+        graph, node, "popart_constant", {}, {},
+        {{"value", label_shape_},
+         {"dims",
+          std::vector<int64_t>{static_cast<int64_t>(label_shape_.size())}},
+         {"dtype", ONNXDataType::INT64}});
+
+    auto reshape_after_loss =
+        CreateBaseOp(graph, node, "popart_reshape",
+                     {nllloss->outputs[0], const_after_loss->outputs[0]},
+                     {GetOutputVarNode("Y", node)}, {});
+    return reshape_after_loss;
+  }
+}
+
+REGISTER_HANDLER(mean, mean_handler);
+REGISTER_HANDLER(pow, pow_handler);
+REGISTER_HANDLER(mul, mul_handler);
+REGISTER_HANDLER(matmul, matmul_handler);
+REGISTER_HANDLER(sum, sum_handler);
+REGISTER_HANDLER(softmax, softmax_handler);
+REGISTER_HANDLER(scale, scale_handler);
+REGISTER_HANDLER(cross_entropy2, cross_entropy2_handler);
+
+}  // namespace
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
new file mode 100644
index 00000000000000..58f3e42b7387a7
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/nn_ops.cc
@@ -0,0 +1,301 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
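Editor's note: a worked shape example for the mul_handler decomposition in math_ops.cc above, with illustrative values X = [2, 3, 4], x_num_col_dims = 1, Y = [12, 5], y_num_col_dims = 1:

// popart_flatten(X, axis=1) -> [2, 12]
// popart_flatten(Y, axis=1) -> [12, 5]
// popart_matmul             -> [2, 5]
// popart_reshape to x_shape[:x_num_col_dims] + y_shape[y_num_col_dims:]
//                           -> [2] + [5] = [2, 5]

The final reshape restores the leading dims of X and the trailing dims of Y around the flattened matmul result.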
+ +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h" +#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { +namespace ipu { +namespace { + +Node *conv2d_handler(Graph *graph, Node *node) { + OpDesc *op = node->Op(); + auto dilations_ = BOOST_GET_CONST(std::vector, op->GetAttr("dilations")); + auto dilations = std::vector{dilations_.begin(), dilations_.end()}; + auto group_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto pads_ = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + if (pads_.size() == 2) { + pads_.push_back(pads_[0]); + pads_.push_back(pads_[1]); + } + auto pads = std::vector{pads_.begin(), pads_.end()}; + auto stride_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto stride = std::vector{stride_.begin(), stride_.end()}; + if (op->HasInput("Bias") && !op->Input("Bias").empty()) { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + GetInputVarNode("Bias", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } else { + return CreateConv( + graph, node, + { + GetInputVarNode("Input", node), GetInputVarNode("Filter", node), + }, + node->outputs, dilations, group_, {}, pads, stride); + } +} + +Node *batch_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + std::vector inputs; + inputs.push_back(GetInputVarNode("X", node)); + inputs.push_back(GetInputVarNode("Scale", node)); + inputs.push_back(GetInputVarNode("Bias", node)); + inputs.push_back(GetInputVarNode("Mean", node)); + inputs.push_back(GetInputVarNode("Variance", node)); + int64_t num_outputs = 1; + std::vector outputs; + auto is_test_type = op->GetAttrType("is_test"); + bool is_test; + if (is_test_type == 0) { + // int + is_test = BOOST_GET_CONST(int, op->GetAttr("is_test")); + } else { + // bool + is_test = BOOST_GET_CONST(bool, op->GetAttr("is_test")); + } + outputs.push_back(GetOutputVarNode("Y", node)); + if (!is_test) { + outputs.push_back(GetOutputVarNode("MeanOut", node)); + outputs.push_back(GetOutputVarNode("VarianceOut", node)); + outputs.push_back(GetOutputVarNode("SavedMean", node)); + outputs.push_back(GetOutputVarNode("SavedVariance", node)); + num_outputs = 5; + } + // outputs.push_back(GetOutputVarNode("ReserveSpace", node)); + auto momentum = BOOST_GET_CONST(float, op->GetAttr("momentum")); + auto epsilon = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + // data_layout + return CreateBaseOp(graph, node, "popart_batchnormalization", inputs, outputs, + { + {"momentum", momentum}, + {"epsilon", epsilon}, + {"num_outputs", num_outputs}, + }); +} + +Node *pool2d_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto pooling_type = BOOST_GET_CONST(std::string, op->GetAttr("pooling_type")); + auto global_pooling = BOOST_GET_CONST(bool, op->GetAttr("global_pooling")); + if (global_pooling) { + if (pooling_type == "max") { + return CreateBaseOp(graph, node, "popart_globalmaxpool", node->inputs, + node->outputs); + } else if (pooling_type == "avg") { + return CreateBaseOp(graph, node, "popart_globalaveragepool", node->inputs, + node->outputs); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } + } + if (op->HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, op->GetAttr("padding_algorithm")); + if (padding_algorithm != 
"EXPLICIT") { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn padding_algorithm: %s", padding_algorithm)); + } + } + + auto ksize = BOOST_GET_CONST(std::vector, op->GetAttr("ksize")); + auto kernel_shape = std::vector{ksize.begin(), ksize.end()}; + auto ceil_mode_ = BOOST_GET_CONST(bool, op->GetAttr("ceil_mode")); + auto ceil_mode = int64_t(ceil_mode_ ? 1 : 0); + auto paddings = BOOST_GET_CONST(std::vector, op->GetAttr("paddings")); + auto pads = std::vector{paddings.begin(), paddings.end()}; + if (pads.size() == 2) { + pads.push_back(paddings[0]); + pads.push_back(paddings[1]); + } + auto strides_ = BOOST_GET_CONST(std::vector, op->GetAttr("strides")); + auto strides = std::vector{strides_.begin(), strides_.end()}; + if (pooling_type == "max") { + int64_t num_outputs = 1; + auto dilations = std::vector{}; + int64_t storage_order = 0; + return CreateBaseOp(graph, node, "popart_maxpool", node->inputs, + node->outputs, { + {"num_outputs", num_outputs}, + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"dilations", dilations}, + {"pads", pads}, + {"storage_order", storage_order}, + {"strides", strides}, + }); + } else if (pooling_type == "avg") { + int64_t count_include_pad = 0; + return CreateBaseOp(graph, node, "popart_averagepool", node->inputs, + node->outputs, + { + {"kernel_shape", kernel_shape}, + {"ceil_mode", ceil_mode}, + {"count_include_pad", count_include_pad}, + {"pads", pads}, + {"strides", strides}, + }); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "op pool2d with unkonwn pooling_type: %s", pooling_type)); + } +} + +Node *group_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto groups_ = BOOST_GET_CONST(int, op->GetAttr("groups")); + auto groups = int64_t{groups_}; + auto attrs_ = AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node), + GetOutputVarNode("Mean", node), + GetOutputVarNode("Variance", node)}; + return CreateBaseOp(graph, node, "popart_groupnormalization_v2", inputs_, + outputs_, attrs_); +} + +Node *instance_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon")); + auto attrs_ = AttributeMap{{"epsilon", epsilon_}}; + + std::vector inputs_ = {GetInputVarNode("X", node), + GetInputVarNode("Scale", node), + GetInputVarNode("Bias", node)}; + std::vector outputs_ = {GetOutputVarNode("Y", node)}; + return CreateBaseOp(graph, node, "popart_instancenormalization", inputs_, + outputs_, attrs_); +} + +Node *layer_norm_handler(Graph *graph, Node *node) { + auto *op = node->Op(); + auto begin_norm_axis_ = BOOST_GET_CONST(int, op->GetAttr("begin_norm_axis")); + auto input_shape_ = GetInputVarNode("X", node)->Var()->GetShape(); + + std::vector norm_shape_{1, 1}; + for (int i = 0; i < input_shape_.size(); i++) { + if (i < begin_norm_axis_) { + norm_shape_[0] *= input_shape_[i]; + } else { + norm_shape_[1] *= input_shape_[i]; + } + } + + auto attrs1 = AttributeMap{ + {"value", norm_shape_}, + {"dims", std::vector{static_cast(norm_shape_.size())}}, + {"dtype", ONNXDataType::INT64}}; + auto reshape1_const = + CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs1); + auto new_node_reshape1 = CreateBaseOp( + graph, node, "popart_reshape", + {GetInputVarNode("X", node), 
+  auto new_node_reshape1 = CreateBaseOp(
+      graph, node, "popart_reshape",
+      {GetInputVarNode("X", node), reshape1_const->outputs[0]}, {}, {});
+
+  auto epsilon_ = BOOST_GET_CONST(float, op->GetAttr("epsilon"));
+  int64_t groups_ = 1;
+  auto groupnorm_attrs_ =
+      AttributeMap{{"epsilon", epsilon_}, {"num_groups", groups_}};
+  auto out_Y_ = MakeVarNode(graph, node);
+  CreateBaseOp(graph, node, "popart_groupnormalization_v2",
+               {new_node_reshape1->outputs[0], GetInputVarNode("Scale", node),
+                GetInputVarNode("Bias", node)},
+               {out_Y_, GetOutputVarNode("Mean", node),
+                GetOutputVarNode("Variance", node)},
+               groupnorm_attrs_);
+
+  auto attrs2 = AttributeMap{
+      {"value", input_shape_},
+      {"dims",
+       std::vector<int64_t>{static_cast<int64_t>(input_shape_.size())}},
+      {"dtype", ONNXDataType::INT64}};
+  auto reshape2_const =
+      CreateBaseOp(graph, node, "popart_constant", {}, {}, attrs2);
+  auto new_node_reshape2 = CreateBaseOp(graph, node, "popart_reshape",
+                                        {out_Y_, reshape2_const->outputs[0]},
+                                        {GetOutputVarNode("Y", node)}, {});
+  return new_node_reshape2;
+}
+
+Node *dropout_handler(Graph *graph, Node *node) {
+  auto *op = node->Op();
+  auto dropout_prob_ = BOOST_GET_CONST(float, op->GetAttr("dropout_prob"));
+  auto dropout_implementation_ =
+      BOOST_GET_CONST(std::string, op->GetAttr("dropout_implementation"));
+  auto is_test_type_ = op->GetAttrType("is_test");
+  bool is_test_;
+  if (is_test_type_ == 0) {
+    // is_test was stored as an int attribute
+    is_test_ = BOOST_GET_CONST(int, op->GetAttr("is_test"));
+  } else {
+    // is_test was stored as a bool attribute
+    is_test_ = BOOST_GET_CONST(bool, op->GetAttr("is_test"));
+  }
+
+  if (is_test_) {
+    if (dropout_implementation_ == "upscale_in_train") {
+      return CreateBaseOp(graph, node, "popart_identity",
+                          {GetInputVarNode("X", node)},
+                          {GetOutputVarNode("Out", node)}, {});
+    } else if (dropout_implementation_ == "downgrade_in_infer") {
+      auto scale =
+          CreateConst(graph, node, {}, {},
+                      {{"value", std::vector<float>{1 - dropout_prob_}},
+                       {"dims", std::vector<int64_t>{1}},
+                       {"dtype", ONNXDataType::FLOAT}});
+      return CreateBaseOp(graph, node, "popart_mul",
+                          {GetInputVarNode("X", node), scale->outputs[0]},
+                          {GetOutputVarNode("Out", node)}, {});
+    } else {
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("Invalid dropout_implementation"));
+    }
+  } else {
+    if (dropout_implementation_ == "upscale_in_train") {
+      auto attrs_ =
+          AttributeMap{{"num_outputs", (int64_t)1}, {"ratio", dropout_prob_}};
+      return CreateBaseOp(graph, node, "popart_dropout",
+                          {GetInputVarNode("X", node)},
+                          {GetOutputVarNode("Out", node)}, attrs_);
+    } else if (dropout_implementation_ == "downgrade_in_infer") {
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "downgrade_in_infer is not supported in training"));
+    } else {
+      PADDLE_THROW(
+          platform::errors::InvalidArgument("Invalid dropout_implementation"));
+    }
+  }
+}
+
+REGISTER_HANDLER(pool2d, pool2d_handler);
+REGISTER_HANDLER(batch_norm, batch_norm_handler);
+REGISTER_HANDLER(group_norm, group_norm_handler);
+REGISTER_HANDLER(instance_norm, instance_norm_handler);
+REGISTER_HANDLER(layer_norm, layer_norm_handler);
+REGISTER_HANDLER(conv2d, conv2d_handler);
+REGISTER_HANDLER(dropout, dropout_handler);
+
+}  // namespace
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
new file mode 100644
index 00000000000000..b7a3a8ca7c60f5
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.cc
@@ -0,0 +1,195 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>   // std::make_unique
+#include <numeric>  // std::iota
+
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+
+// file-scope counters used to generate unique var/op names
+static int var_count = 0;
+static int op_count = 0;
+
+const std::string GenerateVarName() {
+  return std::string("_gen_var_") + std::to_string(var_count++);
+}
+
+const std::string GenerateOpName() {
+  return std::string("_gen_op_") + std::to_string(op_count++);
+}
+
+const std::string CreateOpIdentifyId(Node *node) {
+  // format: op_type|out_var0|out_var1|...|_gen_*
+  // this name will be used as the op name when exporting an onnx model from
+  // popart
+  auto op_type = node->Name();
+  std::string op_out = "";
+  for (auto *out_node : node->outputs) {
+    op_out += "|";
+    op_out += out_node->Name();
+  }
+  return {op_type + op_out + "|" + GenerateOpName()};
+}
+
+Node *MakeVarNode(Graph *graph, Node *node) {
+  auto var_name = GenerateVarName();
+  auto var_desc = std::make_unique<framework::VarDesc>(var_name);
+
+  auto var = graph->CreateVarNode(var_desc.get());
+  return var;
+}
+
+Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
+                 const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs) {
+  auto op_desc = std::make_unique<framework::OpDesc>();
+  op_desc->SetType(type);
+  auto op = graph->CreateOpNode(op_desc.get());
+
+  for (auto *in : inputs) {
+    ConnectNodes(in, op);
+  }
+  if (outputs.empty()) {
+    auto var = MakeVarNode(graph, node);
+    ConnectNodes(op, var);
+  } else {
+    for (auto *out : outputs) {
+      ConnectNodes(op, out);
+    }
+  }
+
+  // record the op's i/o var names on the OpDesc
+  std::vector<std::string> input_names;
+  for (auto node : op->inputs) {
+    input_names.push_back(node->Name());
+  }
+  op->Op()->SetInput("__inputs__", input_names);
+  std::vector<std::string> output_names;
+  for (auto node : op->outputs) {
+    output_names.push_back(node->Name());
+  }
+  op->Op()->SetOutput("__outputs__", output_names);
+  op->Op()->Flush();
+
+  return op;
+}
+
+Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
+                   const std::vector<Node *> &inputs,
+                   const std::vector<Node *> &outputs,
+                   const AttributeMap &attrs) {
+  auto new_node = MakeOpNode(graph, node, type, inputs, outputs);
+  if (!attrs.empty()) {
+    new_node->Op()->SetAttrMap(attrs);
+  }
+  // deal with the special attrs: inherit ipu_index/ipu_stage from the
+  // original op when the new op has not set them
+  if (!new_node->Op()->HasAttr(sIpuIndexAttr)) {
+    CopyOpAttr(sIpuIndexAttr, node->Op(), new_node->Op());
+  }
+  if (!new_node->Op()->HasAttr(sIpuStageAttr)) {
+    CopyOpAttr(sIpuStageAttr, node->Op(), new_node->Op());
+  }
+  {
+    new_node->Op()->SetAttr(sOpIdentifyIdAttr, CreateOpIdentifyId(node));
+    new_node->Op()->Flush();
+  }
+
+  return new_node;
+}
+
+Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                  const std::vector<Node *> &outputs,
+                  const AttributeMap &attrs) {
+  return CreateBaseOp(graph, node, "popart_constant", inputs, outputs, attrs);
+}
+
+Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, const int otype) {
+  auto to = VarType2PopStr(otype);
+  return CreateBaseOp(graph, node, "popart_cast", inputs, outputs,
+                      {{"to", to}});
+}
+
+Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, int64_t transA,
+                 int64_t transB, float alpha, float beta) {
+  return CreateBaseOp(graph, node, "popart_gemm", inputs, outputs,
+                      {
+                          {"alpha", alpha},
+                          {"beta", beta},
+                          {"transA", transA},
+                          {"transB", transB},
+                      });
+}
+
+Node *CreateReshape(Graph *graph, Node *node,
+                    const std::vector<Node *> &inputs,
+                    const std::vector<Node *> &outputs,
+                    const std::vector<int64_t> &oshape) {
+  auto attr = AttributeMap{
+      {"value", oshape},
+      {"dims", std::vector<int64_t>{static_cast<int64_t>(oshape.size())}},
+      {"dtype", ONNXDataType::INT64}};
+  auto new_node_const =
+      CreateBaseOp(graph, node, "popart_constant", {}, {}, attr);
+  auto new_node_reshape =
+      CreateBaseOp(graph, node, "popart_reshape",
+                   {inputs[0], new_node_const->outputs[0]}, outputs);
+  return new_node_reshape;
+}
+
+Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs,
+                 const std::vector<int64_t> &dilations, int64_t group,
+                 const std::vector<int64_t> &kernel_shape,
+                 const std::vector<int64_t> &pads,
+                 const std::vector<int64_t> &strides) {
+  auto attrs = AttributeMap{
+      {"dilations", dilations},       {"group", group},
+      {"kernel_shape", kernel_shape}, {"pads", pads},
+      {"strides", strides},
+  };
+  return CreateBaseOp(graph, node, "popart_conv", inputs, outputs, attrs);
+}
+
+Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
+                           const std::vector<Node *> &inputs,
+                           const std::vector<Node *> &outputs, int64_t axis) {
+  PADDLE_ENFORCE_EQ(inputs.size(), 1,
+                    platform::errors::InvalidArgument(
+                        "The softmax op only supports one input"));
+  auto x_shape = inputs[0]->Var()->GetShape();
+  int x_rank = x_shape.size();
+  if (axis < 0) {
+    axis = axis + x_rank;
+  }
+  if (axis == x_rank - 1) {
+    return CreateBaseOp(graph, node, "popart_softmax", inputs, outputs,
+                        {{"axis", int64_t{-1}}});
+  } else {
+    // move the softmax axis to the last dim, apply softmax, then move it back
+    auto perm = std::vector<int64_t>(x_rank);
+    std::iota(perm.begin(), perm.end(), 0);
+    perm[x_rank - 1] = axis;
+    perm[axis] = x_rank - 1;
+    auto new_transpose_pre = CreateBaseOp(graph, node, "popart_transpose",
+                                          inputs, {}, {{"perm", perm}});
+    auto new_softmax =
+        CreateBaseOp(graph, node, "popart_softmax", new_transpose_pre->outputs,
+                     {}, {{"axis", int64_t{-1}}});
+    return CreateBaseOp(graph, node, "popart_transpose", new_softmax->outputs,
+                        outputs, {{"perm", perm}});
+  }
+}
+
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
new file mode 100644
index 00000000000000..7e70e56ef9166c
--- /dev/null
+++ b/paddle/fluid/platform/device/ipu/popart_canonicalization/op_builder.h
@@ -0,0 +1,85 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
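Editor's note: a worked example for the CreateSoftmaxOpset11 fallback above, with illustrative values x_rank = 4 and axis = 1: perm starts as {0, 1, 2, 3}, and the two assignments swap entries 1 and 3, giving {0, 3, 2, 1}. The emitted graph is then

// transpose(perm={0,3,2,1}) -> softmax(axis=-1) -> transpose(perm={0,3,2,1})

which computes softmax over the original axis 1; the same perm can be reused for the second transpose because a single swap is its own inverse.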
+
+#pragma once
+
+#include "paddle/fluid/platform/device/ipu/common.h"
+#include "paddle/fluid/platform/device/ipu/popart_canonicalization/canonicalization_utils.h"
+
+namespace paddle {
+namespace platform {
+namespace ipu {
+
+using paddle::framework::AttributeMap;
+
+template <typename T>
+AttributeMap MakeConstAttrMap(std::vector<T> value, std::vector<int64_t> dims,
+                              int dtype) {
+  return AttributeMap{{"value", value}, {"dims", dims}, {"dtype", dtype}};
+}
+
+template <typename T>
+AttributeMap MakeConstAttrMapFromValue(T v, std::vector<int64_t> dims,
+                                       int dtype) {
+  size_t size = 1;
+  for (auto &dim : dims) {
+    size *= dim;
+  }
+  return MakeConstAttrMap<T>(std::vector<T>(size, v), dims, dtype);
+}
+
+const std::string GenerateVarName();
+const std::string CreateOpIdentifyId(Node *node);
+
+Node *MakeVarNode(Graph *graph, Node *node);
+Node *MakeOpNode(Graph *graph, Node *node, const std::string &type,
+                 const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs);
+
+Node *CreateBaseOp(Graph *graph, Node *node, const std::string &type,
+                   const std::vector<Node *> &inputs,
+                   const std::vector<Node *> &outputs,
+                   const AttributeMap &attrs = {});
+
+Node *CreateConst(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                  const std::vector<Node *> &outputs,
+                  const AttributeMap &attrs);
+
+// otype is proto::VarType::Type
+Node *CreateCast(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, const int otype);
+
+Node *CreateGemm(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs, int64_t transA = 0,
+                 int64_t transB = 0, float alpha = 1.0f, float beta = 1.0f);
+
+Node *CreateReshape(Graph *graph, Node *node,
+                    const std::vector<Node *> &inputs,
+                    const std::vector<Node *> &outputs,
+                    const std::vector<int64_t> &oshape);
+
+Node *CreateConv(Graph *graph, Node *node, const std::vector<Node *> &inputs,
+                 const std::vector<Node *> &outputs,
+                 const std::vector<int64_t> &dilations = {1, 1},
+                 int64_t group = 1,
+                 const std::vector<int64_t> &kernel_shape = {},
+                 const std::vector<int64_t> &pads = {0, 0, 0, 0},
+                 const std::vector<int64_t> &strides = {1, 1});
+
+Node *CreateSoftmaxOpset11(Graph *graph, Node *node,
+                           const std::vector<Node *> &inputs,
+                           const std::vector<Node *> &outputs, int64_t axis);
+
+}  // namespace ipu
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
index 4cd7f928f6e22b..763c5a46abe287 100644
--- a/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
+++ b/paddle/fluid/platform/device/ipu/supported_ops_autogen.h
@@ -195,3 +195,5 @@ OP_DECL(popart_sqrt, aiOnnxOpset.sqrt, NONE) // NOLINT
 OP_DECL(popart_tanh, aiOnnxOpset.tanh, NONE) // NOLINT
 OP_DECL(popart_tile, aiOnnxOpset.tile, NONE) // NOLINT
 OP_DECL(popart_transpose, aiOnnxOpset.transpose, ARG(INT_VEC,perm) ) // NOLINT
+
+// clang-format on
diff --git a/paddle/fluid/pybind/reader_py.cc b/paddle/fluid/pybind/reader_py.cc
index 60b99a964a57fe..d4fa1b2c89abf3 100644
--- a/paddle/fluid/pybind/reader_py.cc
+++ b/paddle/fluid/pybind/reader_py.cc
@@ -37,6 +37,9 @@ PADDLE_DEFINE_EXPORTED_bool(
     "If set true, the queue.pop will only get data from queue but not "
     "remove the data from queue for speed testing");
 
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
 namespace paddle {
 namespace pybind {
 
diff --git a/paddle/pten/api/lib/CMakeLists.txt b/paddle/pten/api/lib/CMakeLists.txt
index 96ad9ade8e3ad5..d1e60c4505d6b0 100644
--- a/paddle/pten/api/lib/CMakeLists.txt
+++ b/paddle/pten/api/lib/CMakeLists.txt
@@ -22,6 +22,10 @@ set(api_source_file
${CMAKE_SOURCE_DIR}/paddle/pten/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) set(api_source_file_tmp ${api_source_file}.tmp) +if (NOT PYTHON_EXECUTABLE) + find_package(PythonInterp REQUIRED) +endif() + add_custom_command( OUTPUT ${api_header_file} ${api_source_file} COMMAND ${PYTHON_EXECUTABLE} -m pip install pyyaml diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h new file mode 100644 index 00000000000000..8c21094a4af202 --- /dev/null +++ b/paddle/pten/api/lib/kernel_declare.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/kernel_registry.h" + +// TODO(chenweihang) After the kernel is split into a single file, +// the kernel declare statement is automatically generated according to the +// file name of the kernel, and this header file will be removed + +PT_DECLARE_KERNEL(full_like, CPU); +PT_DECLARE_KERNEL(dot, CPU); +PT_DECLARE_KERNEL(flatten, CPU); +PT_DECLARE_KERNEL(sign, CPU); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PT_DECLARE_KERNEL(full_like, CUDA); +PT_DECLARE_KERNEL(dot, CUDA); +PT_DECLARE_KERNEL(flatten, CUDA); +PT_DECLARE_KERNEL(sign, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(flatten, XPU); +#endif diff --git a/paddle/pten/api/lib/utils.cc b/paddle/pten/api/lib/utils.cc index e17b19d9f689e1..bfde9b14b0020d 100644 --- a/paddle/pten/api/lib/utils.cc +++ b/paddle/pten/api/lib/utils.cc @@ -25,10 +25,14 @@ limitations under the License. */ #include "paddle/pten/include/core.h" #include "paddle/pten/include/infermeta.h" -PT_DECLARE_MODULE(UtilsCPU); +PT_DECLARE_KERNEL(copy, CPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(UtilsCUDA); +PT_DECLARE_KERNEL(copy, CUDA); +#endif + +#ifdef PADDLE_WITH_XPU +PT_DECLARE_KERNEL(copy, XPU); #endif namespace paddle { diff --git a/paddle/pten/core/kernel_registry.h b/paddle/pten/core/kernel_registry.h index cd6fa80906cfbd..be624177dfb14c 100644 --- a/paddle/pten/core/kernel_registry.h +++ b/paddle/pten/core/kernel_registry.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include #include @@ -24,6 +25,8 @@ #include "paddle/pten/core/kernel_factory.h" #include "paddle/pten/core/kernel_utils.h" +#include "paddle/fluid/platform/enforce.h" + namespace pten { #define BACKEND(arg__) pten::Backend::arg__ @@ -193,64 +196,35 @@ struct KernelRegistrar { #define _PT_ARG_N(args) _PT_ARG_N_EXPAND args #define _PT_RESQ_N() 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 +/** PT_REGISTER_KERNEL + * + * The most frequently used kernel registration macro, used for kernel + * registration with only data type as template parameter, and the function + * pointer of the corresponding data type is automatically instantiated + * during registration. + */ #define PT_REGISTER_KERNEL( \ kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\ - _PT_REGISTER_KERNEL(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) + PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ + pt_register_kernel_ns_check_##kernel_name, \ + "PT_REGISTER_KERNEL must be called in global namespace."); \ + _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, __VA_ARGS__) + #ifndef _WIN32 -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, __VA_ARGS__); \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) #else -#define _PT_REGISTER_KERNEL( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "PT_REGISTER_KERNEL must be called in global namespace."); \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel * kernel) -#endif - -#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) \ - _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ - PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ - (meta_kernel_fn, cpp_dtype, __VA_ARGS__) - /** * `template decltype(fn) fn` can work on gcc and clang, * but msvc will failed, error like: @@ -261,8 +235,30 @@ struct KernelRegistrar { * * https://stackoverflow.com/questions/63989585/explicit-instantiation-of-function-using-decltype-work-on-g-but-not-on-visua * - * So we solve the explict instantiation of kernel by CMake + * And msvc can work without template instantiation */ +#define _PT_REGISTER_KERNEL( \ + kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ + static void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \ + PT_KERNEL_REGISTRAR_INIT(kernel_name, \ + backend, \ + layout, \ + &__PT_KERNEL_args_def_FN_##kernel_name, \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__); \ + void __PT_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel) +#endif + +#define PT_KERNEL_INSTANTIATION(meta_kernel_fn, cpp_dtype, ...) 
\ + _PT_KERNEL_INSTANTIATION(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + meta_kernel_fn, \ + cpp_dtype, \ + __VA_ARGS__) + +#define _PT_KERNEL_INSTANTIATION(N, meta_kernel_fn, cpp_dtype, ...) \ + PT_CONCATENATE(_PT_KERNEL_INSTANTIATION_, N) \ + (meta_kernel_fn, cpp_dtype, __VA_ARGS__) #define _PT_KERNEL_INSTANTIATION_1(meta_kernel_fn, cpp_dtype, ...) \ template decltype(meta_kernel_fn) meta_kernel_fn @@ -309,22 +305,15 @@ struct KernelRegistrar { template decltype(meta_kernel_fn) meta_kernel_fn; \ PT_EXPAND(_PT_KERNEL_INSTANTIATION_14(meta_kernel_fn, __VA_ARGS__)) -#define PT_KERNEL_REGISTRAR_INIT(kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ - ...) \ - _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ - kernel_name, \ - func_id, \ - backend, \ - layout, \ - args_def_fn, \ - meta_kernel_fn, \ - cpp_dtype, \ +#define PT_KERNEL_REGISTRAR_INIT( \ + kernel_name, backend, layout, args_def_fn, meta_kernel_fn, cpp_dtype, ...) \ + _PT_KERNEL_REGISTRAR_INIT(PT_NARGS(cpp_dtype, __VA_ARGS__), \ + kernel_name, \ + backend, \ + layout, \ + args_def_fn, \ + meta_kernel_fn, \ + cpp_dtype, \ __VA_ARGS__) // clang-format off @@ -333,7 +322,6 @@ struct KernelRegistrar { and multi-line macros cannot be skipped with NOLINT.*/ #define _PT_KERNEL_REGISTRAR_INIT(N, \ kernel_name, \ - func_id, \ backend, \ layout, \ args_def_fn, \ @@ -342,7 +330,6 @@ struct KernelRegistrar { ...) \ PT_CONCATENATE(_PT_KERNEL_REGISTRAR_INIT_, N) ( \ kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -354,7 +341,6 @@ struct KernelRegistrar { // clang-format on #define _PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -363,17 +349,17 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ ::pten::KernelArgsParseFunctor)>::Parse, \ args_def_fn, \ - PT_KERNEL(meta_kernel_fn)); + PT_KERNEL(meta_kernel_fn)); \ + int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } #define _PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -382,8 +368,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -392,7 +378,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_1(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -400,7 +385,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -409,8 +393,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -419,7 +403,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_2(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -427,7 +410,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -436,8 +418,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -446,7 +428,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_3(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -454,7 +435,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -463,8 +443,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -473,7 +453,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_4(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -481,7 +460,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -490,8 +468,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -500,7 +478,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_5(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -508,7 +485,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -517,8 +493,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -527,7 +503,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_6(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -535,7 +510,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -544,8 +518,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -554,7 +528,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_7(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -562,7 +535,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -571,8 +543,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -581,7 +553,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_8(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -589,7 +560,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -598,8 +568,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -608,7 +578,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_9(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -616,7 +585,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -625,8 +593,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -635,7 +603,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_10(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -643,7 +610,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -652,8 +618,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -662,7 +628,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_11(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -670,7 +635,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -679,8 +643,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -689,7 +653,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_12(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -697,7 +660,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -706,8 +668,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) \ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -716,7 +678,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_13(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -724,7 +685,6 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) #define _PT_KERNEL_REGISTRAR_INIT_15(kernel_name, \ - func_id, \ registrar_id, \ backend, \ layout, \ @@ -733,8 +693,8 @@ struct KernelRegistrar { cpp_dtype, \ ...) 
\ static const ::pten::KernelRegistrar PT_CONCATENATE( \ - __reg_pt_op_kernel_##func_id##_, registrar_id)( \ - kernel_name, \ + __reg_pt_kernel_##kernel_name##_, registrar_id)( \ + #kernel_name, \ BACKEND(backend), \ DATALAYOUT(layout), \ ::paddle::experimental::CppTypeToDataType::Type(), \ @@ -743,7 +703,6 @@ struct KernelRegistrar { args_def_fn, \ PT_KERNEL(meta_kernel_fn)); \ PT_EXPAND(_PT_KERNEL_REGISTRAR_INIT_14(kernel_name, \ - func_id, \ PT_ID, \ backend, \ layout, \ @@ -751,90 +710,59 @@ struct KernelRegistrar { meta_kernel_fn, \ __VA_ARGS__)) -#define PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, backend, layout, dtype, kernel_fn) \ - _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, PT_ID, backend, layout, dtype, kernel_fn) - -#define _PT_REGISTER_KERNEL_STANDARD( \ - kernel_name, func_id, backend, layout, dtype, kernel_fn) \ - PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \ - PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \ - "_PT_REGISTER_KERNEL_STANDARD must be called in global namespace."); \ - template decltype(kernel_fn) kernel_fn; \ - static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \ - func_id)(::pten::Kernel*); \ - static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \ - func_id)( \ - kernel_name, \ - BACKEND(backend), \ - DATALAYOUT(layout), \ - DATATYPE(dtype), \ - ::pten::KernelArgsParseFunctor::Parse, \ - args_def_fn, \ - PT_KERNEL(kernel_fn)); \ - void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id)(::pten::Kernel*) - -// use to declare symbol -#define PT_REGISTER_MODULE(name) \ - int RegisterSymbolsFor##name() { return 0; } - -#define PT_DECLARE_MODULE(name) \ - extern int RegisterSymbolsFor##name(); \ - UNUSED static int use_kernel_module_##name = RegisterSymbolsFor##name() - -// only used in cpp tests - -#define PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, backend, layout, meta_kernel_fn, cpp_dtype, ...) \ - _PT_REGISTER_KERNEL_FOR_TEST(kernel_name, \ - PT_ID, \ - backend, \ - layout, \ - meta_kernel_fn, \ - cpp_dtype, \ - __VA_ARGS__) - -#define _PT_REGISTER_KERNEL_FOR_TEST( \ - kernel_name, func_id, backend, layout, meta_kernel_fn, cpp_dtype, ...) 
\
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_for_test_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
-  static void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
-  PT_KERNEL_REGISTRAR_INIT( \
-      kernel_name, \
-      func_id, \
-      backend, \
-      layout, \
-      &PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, func_id), \
-      meta_kernel_fn, \
-      cpp_dtype, \
-      __VA_ARGS__); \
-  void PT_CONCATENATE(__PT_KERNEL_for_test_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
-
-#define PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-    kernel_name, backend, layout, meta_kernel_fn) \
-  _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-      kernel_name, PT_ID, backend, layout, meta_kernel_fn)
-
-#define _PT_REGISTER_KERNEL_WITH_NO_TYPE( \
-    kernel_name, func_id, backend, layout, meta_kernel_fn) \
-  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
-      PT_CONCATENATE(pt_op_kernel_ns_check_, func_id), \
-      "PT_REGISTER_KERNEL must be called in global namespace."); \
-  decltype(meta_kernel_fn) meta_kernel_fn; \
-  static void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                             func_id)(::pten::Kernel*); \
-  static const ::pten::KernelRegistrar PT_CONCATENATE(__reg_pt_op_kernel_, \
-                                                      func_id)( \
-      kernel_name, \
-      BACKEND(backend), \
-      DATALAYOUT(layout), \
-      ::pten::KernelArgsParseFunctor<decltype(&meta_kernel_fn)>::Parse, \
-      &PT_CONCATENATE(__PT_KERNEL_args_def_FN_, func_id), \
-      PT_KERNEL(meta_kernel_fn)); \
-  void PT_CONCATENATE(__PT_KERNEL_args_def_FN_, \
-                      func_id)(::pten::Kernel * kernel)
+/** PT_REGISTER_SINGLE_KERNEL
+ *
+ * Used to register a single kernel: pass in the kernel's complete function
+ * pointer; this registration macro does not perform automatic template
+ * instantiation.
+ */
+#define PT_REGISTER_SINGLE_KERNEL( \
+    kernel_name, backend, layout, dtype, kernel_fn) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
+      pt_register_single_kernel_ns_check_##kernel_name, \
+      "PT_REGISTER_SINGLE_KERNEL must be called in global namespace."); \
+  static void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel*); \
+  static const ::pten::KernelRegistrar __reg_pt_single_kernel_##kernel_name( \
+      #kernel_name, \
+      BACKEND(backend), \
+      DATALAYOUT(layout), \
+      DATATYPE(dtype), \
+      ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
+      &__PT_SINGLE_KERNEL_args_def_FN_##kernel_name, \
+      PT_KERNEL(kernel_fn)); \
+  int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
+  void __PT_SINGLE_KERNEL_args_def_FN_##kernel_name(::pten::Kernel* kernel)
+
+/** PT_REGISTER_KERNEL_ALL_DTYPE
+ *
+ * Used to register a kernel that supports all data types, such as copy and
+ * reshape, which are not sensitive to data types.
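+ *
+ * Usage sketch (mirroring the copy kernel registration later in this
+ * change):
+ *
+ *   PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {}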
+ */
+#define PT_REGISTER_KERNEL_ALL_DTYPE(kernel_name, backend, layout, kernel_fn) \
+  PT_STATIC_ASSERT_GLOBAL_NAMESPACE( \
+      pt_register_kernel_all_dtype_ns_check_##kernel_name, \
+      "PT_REGISTER_KERNEL_ALL_DTYPE must be called in global namespace."); \
+  static void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name( \
+      ::pten::Kernel*); \
+  static const ::pten::KernelRegistrar \
+      __reg_pt_kernel_all_dtype_##kernel_name( \
+          #kernel_name, \
+          BACKEND(backend), \
+          DATALAYOUT(layout), \
+          ::pten::KernelArgsParseFunctor<decltype(&kernel_fn)>::Parse, \
+          &__PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name, \
+          PT_KERNEL(kernel_fn)); \
+  int TouchKernelSymbolFor_##kernel_name##_##backend() { return 0; } \
+  void __PT_KERNEL_ALL_DTYPE_args_def_FN_##kernel_name(::pten::Kernel* kernel)
+
+/** PT_DECLARE_KERNEL
+ *
+ * Used to export the symbols of the file where the kernel is located,
+ * so that they are not removed by the linker.
+ */
+#define PT_DECLARE_KERNEL(kernel_name, backend) \
+  extern int TouchKernelSymbolFor_##kernel_name##_##backend(); \
+  UNUSED static int __declare_kernel_symbol_for_##kernel_name##_##backend = \
+      TouchKernelSymbolFor_##kernel_name##_##backend()
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/creation.cc b/paddle/pten/kernels/cpu/creation.cc
index 4f09fc489f8f67..4175203410f8da 100644
--- a/paddle/pten/kernels/cpu/creation.cc
+++ b/paddle/pten/kernels/cpu/creation.cc
@@ -61,9 +61,7 @@ void FillConstant(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_MODULE(CreationCPU);
-
-PT_REGISTER_KERNEL("full_like",
+PT_REGISTER_KERNEL(full_like,
                    CPU,
                    ANY,
                    pten::FillAnyLike,
@@ -74,7 +72,7 @@ PT_REGISTER_KERNEL("full_like",
                    bool,
                    paddle::platform::float16) {}
 
-PT_REGISTER_KERNEL("full",
+PT_REGISTER_KERNEL(full,
                    CPU,
                    ANY,
                    pten::FillConstant,
diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc
index 32411560b55168..7ffac0537b60c0 100644
--- a/paddle/pten/kernels/cpu/linalg.cc
+++ b/paddle/pten/kernels/cpu/linalg.cc
@@ -70,12 +70,10 @@ void Matmul(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-PT_REGISTER_MODULE(LinalgCPU);
-
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
-PT_REGISTER_KERNEL("dot",
+PT_REGISTER_KERNEL(dot,
                    CPU,
                    ANY,
                    pten::Dot,
@@ -87,5 +85,4 @@ PT_REGISTER_KERNEL("dot",
                    complex128) {}
 
 PT_REGISTER_KERNEL(
-    "matmul_v2", CPU, ANY, pten::Matmul, float, double, complex64, complex128) {
-}
+    matmul_v2, CPU, ANY, pten::Matmul, float, double, complex64, complex128) {}
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index e0e9cefbf671bf..61c6cb57a9f780 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -130,12 +130,9 @@ void Cast(const CPUContext& dev_ctx,
 
 }  // namespace pten
 
-// TODO(chenweihang): replace by better impl
-PT_REGISTER_MODULE(ManipulationCPU);
-
 // TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel
 // architecture, kernel_name should be "flatten".
-PT_REGISTER_KERNEL("flatten", +PT_REGISTER_KERNEL(flatten, CPU, ANY, pten::Flatten, @@ -145,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CPU, ANY, pten::FlattenWithXShape, @@ -156,7 +152,8 @@ PT_REGISTER_KERNEL("flatten.mid", int8_t, int, int64_t) {} -PT_REGISTER_KERNEL("cast", + +PT_REGISTER_KERNEL(cast, CPU, ANY, pten::Cast, @@ -174,42 +171,33 @@ PT_REGISTER_KERNEL("cast", kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CPU, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CPU, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", CPU, ANY, pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CPU, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CPU, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CPU, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CPU, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CPU, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CPU, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CPU, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CPU, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CPU, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index ddfb2f5f458540..2d556d96c2fcf7 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -106,18 +106,14 @@ DEFINE_CPU_ELEMENTWISE_OP(Mul) } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCPU); - using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; // NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 // using bfloat16 = ::paddle::platform::bfloat16; - -PT_REGISTER_KERNEL("sign", CPU, ANY, pten::Sign, float, double) {} -PT_REGISTER_KERNEL("mean", CPU, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CPU, ANY, pten::Sign, float, double) {} +PT_REGISTER_KERNEL(mean, CPU, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CPU, ANY, pten::Scale, @@ -129,8 +125,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} - -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CPU, ANY, pten::ElementwiseAdd, @@ -140,7 +135,7 @@ PT_REGISTER_KERNEL("add", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CPU, ANY, 
pten::ElementwiseSub, @@ -150,7 +145,7 @@ PT_REGISTER_KERNEL("subtract", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CPU, ANY, pten::ElementwiseDiv, @@ -160,7 +155,7 @@ PT_REGISTER_KERNEL("divide", int64_t, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CPU, ANY, pten::ElementwiseMul, @@ -171,8 +166,7 @@ PT_REGISTER_KERNEL("multiply", bool, complex64, complex128) {} - -PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CPU, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cpu/utils.cc b/paddle/pten/kernels/cpu/utils.cc index b462ef70c2f06e..500b4664d63888 100644 --- a/paddle/pten/kernels/cpu/utils.cc +++ b/paddle/pten/kernels/cpu/utils.cc @@ -57,7 +57,4 @@ void Copy(const CPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CPU, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/cuda/creation.cu b/paddle/pten/kernels/cuda/creation.cu index 8bc23fb6af056f..dd29fd5fbb84d2 100644 --- a/paddle/pten/kernels/cuda/creation.cu +++ b/paddle/pten/kernels/cuda/creation.cu @@ -62,9 +62,7 @@ void FillConstant(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(CreationCUDA); - -PT_REGISTER_KERNEL("full_like", +PT_REGISTER_KERNEL(full_like, CUDA, ANY, pten::FillAnyLike, @@ -75,7 +73,7 @@ PT_REGISTER_KERNEL("full_like", bool, paddle::platform::float16) {} -PT_REGISTER_KERNEL("full", +PT_REGISTER_KERNEL(full, CUDA, ANY, pten::FillConstant, diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu index fe2ac6f184ff73..b08ed8f71ee6b2 100644 --- a/paddle/pten/kernels/cuda/linalg.cu +++ b/paddle/pten/kernels/cuda/linalg.cu @@ -54,13 +54,11 @@ void Matmul(const CUDAContext& dev_ctx, } // namespace pten -PT_REGISTER_MODULE(LinalgCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("dot", +PT_REGISTER_KERNEL(dot, CUDA, ANY, pten::Dot, @@ -71,7 +69,7 @@ PT_REGISTER_KERNEL("dot", complex64, complex128) {} -PT_REGISTER_KERNEL("matmul_v2", +PT_REGISTER_KERNEL(matmul_v2, CUDA, ANY, pten::Matmul, diff --git a/paddle/pten/kernels/cuda/manipulation.cu b/paddle/pten/kernels/cuda/manipulation.cu index acaf1ac2cc62b2..e668d1b04d7238 100644 --- a/paddle/pten/kernels/cuda/manipulation.cu +++ b/paddle/pten/kernels/cuda/manipulation.cu @@ -129,13 +129,9 @@ void Cast(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationCUDA); - using float16 = paddle::platform::float16; -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten", + +PT_REGISTER_KERNEL(flatten, CUDA, ANY, pten::Flatten, @@ -146,8 +142,7 @@ PT_REGISTER_KERNEL("flatten", int8_t, int, int64_t) {} - -PT_REGISTER_KERNEL("flatten.mid", +PT_REGISTER_KERNEL(flatten_mid, CUDA, ANY, pten::FlattenWithXShape, @@ -159,7 +154,7 @@ PT_REGISTER_KERNEL("flatten.mid", int64_t) {} #define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) 
\ - PT_REGISTER_KERNEL("cast", \ + PT_REGISTER_KERNEL(cast, \ CUDA, \ ANY, \ pten::Cast, \ @@ -184,44 +179,33 @@ PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) #endif -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape", - CUDA, - ANY, - pten::ReshapeFromVectorVal) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mid", - CUDA, - ANY, - pten::ReshapeFromVectorValWithXShape) {} - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host", - CUDA, - ANY, - pten::ReshapeFromDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, CUDA, ANY, pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mid, + CUDA, + ANY, + pten::ReshapeFromVectorValWithXShape) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host, CUDA, ANY, pten::ReshapeFromDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.host.mid", - CUDA, - ANY, - pten::ReshapeFromDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_host_mid, + CUDA, + ANY, + pten::ReshapeFromDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost", - CUDA, - ANY, - pten::ReshapeFromVectorDT) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost, + CUDA, + ANY, + pten::ReshapeFromVectorDT) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } - -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape.mulhost.mid", - CUDA, - ANY, - pten::ReshapeFromVectorDTWithXShape) { +PT_REGISTER_KERNEL_ALL_DTYPE(reshape_mulhost_mid, + CUDA, + ANY, + pten::ReshapeFromVectorDTWithXShape) { kernel->InputAt(1).SetBackend(pten::Backend::CPU); kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32); } diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index 388d42942c10a4..66aaf14dcd0f62 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -111,16 +111,13 @@ void Sum(const CUDAContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(MathCUDA); - using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL("sign", CUDA, ANY, pten::Sign, float, double, float16) {} -PT_REGISTER_KERNEL("mean", CUDA, ANY, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL("scale", +PT_REGISTER_KERNEL(sign, CUDA, ANY, pten::Sign, float, double, float16) {} +PT_REGISTER_KERNEL(mean, CUDA, ANY, pten::Mean, float, double, bool) {} +PT_REGISTER_KERNEL(scale, CUDA, ANY, pten::Scale, @@ -132,7 +129,7 @@ PT_REGISTER_KERNEL("scale", int16_t, int, int64_t) {} -PT_REGISTER_KERNEL("add", +PT_REGISTER_KERNEL(add, CUDA, ANY, pten::ElementwiseAdd, @@ -143,7 +140,7 @@ PT_REGISTER_KERNEL("add", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("subtract", +PT_REGISTER_KERNEL(subtract, CUDA, ANY, pten::ElementwiseSub, @@ -154,7 +151,7 @@ PT_REGISTER_KERNEL("subtract", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("divide", +PT_REGISTER_KERNEL(divide, CUDA, ANY, pten::ElementwiseDiv, @@ -165,7 +162,7 @@ PT_REGISTER_KERNEL("divide", float16, complex64, complex128) {} -PT_REGISTER_KERNEL("multiply", +PT_REGISTER_KERNEL(multiply, CUDA, ANY, pten::ElementwiseMul, @@ -177,7 +174,7 @@ PT_REGISTER_KERNEL("multiply", float16, complex64, complex128) {} 
-PT_REGISTER_KERNEL("sum", +PT_REGISTER_KERNEL(sum, CUDA, ANY, pten::Sum, diff --git a/paddle/pten/kernels/cuda/utils.cu b/paddle/pten/kernels/cuda/utils.cu index 24da650d1f3eb9..49027e956b2d7d 100644 --- a/paddle/pten/kernels/cuda/utils.cu +++ b/paddle/pten/kernels/cuda/utils.cu @@ -234,7 +234,4 @@ void Copy(const CUDAContext& dev_ctx, } } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsCUDA); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", CUDA, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, CUDA, ANY, pten::Copy) {} diff --git a/paddle/pten/kernels/xpu/manipulation.cc b/paddle/pten/kernels/xpu/manipulation.cc index 5f1c0d42eb5a8f..f361933cad45a5 100644 --- a/paddle/pten/kernels/xpu/manipulation.cc +++ b/paddle/pten/kernels/xpu/manipulation.cc @@ -95,12 +95,7 @@ void ReshapeFromVectorDT(const XPUContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(ManipulationXPU); - -// TODO(yuanrisheng): "flatten_contiguous_range" is compatible with old kernel -// architecture, kernel_name should be "flatten". -PT_REGISTER_KERNEL("flatten_contiguous_range", +PT_REGISTER_KERNEL(flatten, XPU, ANY, pten::Flatten, @@ -112,7 +107,7 @@ PT_REGISTER_KERNEL("flatten_contiguous_range", int, int64_t) {} -PT_REGISTER_KERNEL("flatten_contiguous_range.mid", +PT_REGISTER_KERNEL(flatten_mid, XPU, ANY, pten::FlattenWithXShape, @@ -124,9 +119,4 @@ PT_REGISTER_KERNEL("flatten_contiguous_range.mid", int, int64_t) {} -// TODO(yuanrisheng): "reshape2" is compatible with old kernel -// architecture, kernel_name should be "reshape". -PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2", - XPU, - ANY, - pten::ReshapeFromVectorVal) {} +PT_REGISTER_KERNEL_ALL_DTYPE(reshape, XPU, ANY, pten::ReshapeFromVectorVal) {} diff --git a/paddle/pten/kernels/xpu/utils.cc b/paddle/pten/kernels/xpu/utils.cc index 329dc2baf87b58..5c98217f4ec2c4 100644 --- a/paddle/pten/kernels/xpu/utils.cc +++ b/paddle/pten/kernels/xpu/utils.cc @@ -76,7 +76,4 @@ void Copy(const XPUDeviceContext& dev_ctx, } // namespace pten -// TODO(chenweihang): replace by better impl -PT_REGISTER_MODULE(UtilsXPU); - -PT_REGISTER_KERNEL_WITH_NO_TYPE("copy", XPU, ANY, pten::Copy) {} +PT_REGISTER_KERNEL_ALL_DTYPE(copy, XPU, ANY, pten::Copy) {} diff --git a/paddle/pten/tests/api/test_reshape_api.cc b/paddle/pten/tests/api/test_reshape_api.cc index b6179f11b1019e..227dcc6e9568d7 100644 --- a/paddle/pten/tests/api/test_reshape_api.cc +++ b/paddle/pten/tests/api/test_reshape_api.cc @@ -21,12 +21,6 @@ limitations under the License. 
*/ #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/core/kernel_registry.h" -PT_DECLARE_MODULE(ManipulationCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(ManipulationCUDA); -#endif - namespace paddle { namespace tests { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index ba809dceee191b..6ffdd75f72b4ec 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -230,6 +230,8 @@ from .tensor.math import lerp # noqa: F401 from .tensor.math import rad2deg # noqa: F401 from .tensor.math import deg2rad # noqa: F401 +from .tensor.math import gcd # noqa: F401 +from .tensor.math import lcm # noqa: F401 from .tensor.math import diff # noqa: F401 from .tensor.math import angle # noqa: F401 @@ -483,6 +485,8 @@ 'atan2', 'rad2deg', 'deg2rad', + 'gcd', + 'lcm', 'expand', 'broadcast_to', 'ones_like', diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index ffd24add50a4d0..dc313c33ee3e28 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -27,11 +27,13 @@ import paddle import paddle.fluid as fluid from paddle import framework +from paddle.fluid import core import paddle.distributed as dist from paddle.optimizer import Optimizer +from paddle.fluid.clip import ClipGradByGlobalNorm from ...utils.internal_storage import ParamStorage -from ...meta_parallel.sharding.sharding_utils import Type +from ...meta_parallel.sharding.sharding_utils import Type, device_guard, ShardingClipGrad # CUDA alignment 256 bytes alignment = {"gpu": 256, } @@ -99,16 +101,41 @@ def __init__(self, self.broadcast_fp16 = broadcast_fp16 self.param_storages = {} # {dtype: {rank: InternalStorage}} + + if isinstance(self._optim._grad_clip, ClipGradByGlobalNorm): + logging.warning( + "While using ClipGradByGlobalNorm in ShardingOptimizer, the grad clip of original optimizer will be changed." + ) + self._optim._grad_clip = ShardingClipGrad(self._optim._grad_clip, + group, + paddle.get_device()) + + if offload: + assert self._pfp16, "Only support offload strategy while using \'Adam\', \'AdamW\' and \'Momentum\' optimizer with AMP/Pure FP16" + self.offload = offload # Using for offload + self.offload_device = "cpu" + + self._master_params = {} # Update optimizer parameters and adjust parameter storage and use according to rank. self.update_opt_status() def _generate_master_params(self, trainable_params): - for param in trainable_params: - if param.dtype == Type.fp16.value: - self._optim._master_weights[param.name] = paddle.cast( - param, Type.fp32.value) + if self.offload: + for param in trainable_params: + if param.name not in self._master_params.keys(): + self._master_params[param.name] = core.VarBase( + name=param.name, + value=param.cast(dtype=Type.fp32.value).numpy(), + place=core.CPUPlace(), + stop_gradient=param.stop_gradient) + self._optim._master_weights = self._master_params + else: + for param in trainable_params: + if param.dtype == Type.fp16.value: + self._optim._master_weights[param.name] = paddle.cast( + param, Type.fp32.value) def update_opt_status(self): """Update optimizer status and parameter storage information, and special functions to be developed. 
@@ -243,22 +270,43 @@ def step(self): A wrapper for Optimizer's step function to finish the update operation of the optimizer. """ - # Synchronize optimizer parameters for the current rank - if len(self.dtype_rank_params.keys( - )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp32.value][self.rank] - elif len(self.dtype_rank_params.keys( - )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + if self.offload: + self._optim._parameter_list = [ + param for name, param in self._master_params.items() + ] else: - self._optim._parameter_list = self.dtype_rank_params[ - Type.fp16.value][self.rank] + self.dtype_rank_params[ + # Synchronize optimizer parameters for the current rank + if len(self.dtype_rank_params.keys( + )) == 1 and Type.fp32.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ Type.fp32.value][self.rank] + elif len(self.dtype_rank_params.keys( + )) == 1 and Type.fp16.value in self.dtype_rank_params.keys(): + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + else: + self._optim._parameter_list = self.dtype_rank_params[ + Type.fp16.value][self.rank] + self.dtype_rank_params[ + Type.fp32.value][self.rank] # Run the optimizer of the current rank step - self._optim.step() + if self.offload: + with device_guard(self.rank, self.offload_device): + self._optim.step() + + for param in self._optim._parameter_list: + self._master_params[param.name].set_value(param) + + dev_id = 0 if paddle.get_device() == "cpu" else int( + paddle.get_device().split(":")[1]) + + for param in self._local_params: + if param.name in self._master_params.keys(): + param.set_value(self._master_params[param.name].cuda(dev_id) + .cast(dtype=param.dtype)) + self._master_params[param.name].clear_gradient(False) + else: + self._optim.step() # Synchronize all the updated shards in between the ranks self._broadcast_params() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 37b85751149f71..fd49c2a7d65869 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -112,6 +112,18 @@ def __init__( self._has_grad_storage = [] self._grad_storage_list = [] + # offload + # TODO(haohongxiang): Now it's not supported for multi-optimizers using Offload strategy + self._offload_optims = list( + filter(lambda optim: optim.offload, self._sharding_optimizers)) + if len(self._offload_optims) > 0: + assert len( + self._sharding_optimizers + ) == 1, "Only support offload strategy for single optimizer" + + self._offload = self._sharding_optimizers[0].offload + self._offload_device = "cpu" + # Set backward pass hooks self._bw_hooks = [] @@ -156,7 +168,8 @@ def clear_gradients(self): # Release grad storages for dtype in self._grad_storages.keys(): if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.zero_() + if not self._offload: + self._grad_storages[dtype][self._rank].buffer.zero_() # Release params for param in self._trainable_params: @@ -167,17 +180,24 @@ def grad_scale(self): """ Before the gradient accumulation, scale the gradient. 
""" - # Scale grad storages - for dtype in self._grad_storages.keys(): - if self._rank in self._grad_storages[dtype].keys(): - self._grad_storages[dtype][self._rank].buffer.scale_( - scale=self._world_size_scaling) - - # Scale params - for param in self._trainable_params: - if param.name in self._param_grads and param.grad is not None: - param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + if self._offload: + for param in self._trainable_params: + if param.name in self._sharding_optimizers[ + 0]._master_params.keys(): + self._sharding_optimizers[0]._master_params[ + param.name].grad.scale_(scale=self._world_size_scaling) + else: + # Scale grad storages + for dtype in self._grad_storages.keys(): + if self._rank in self._grad_storages[dtype].keys(): + self._grad_storages[dtype][self._rank].buffer.scale_( + scale=self._world_size_scaling) + + # Scale params + for param in self._trainable_params: + if param.name in self._param_grads and param.grad is not None: + param.grad.scale_(scale=self._world_size_scaling) + param._reset_grad_inplace_version(True) def _init_internal_storage(self, needs_fresh): """ @@ -195,8 +215,14 @@ def to(self, device=None, dtype=None, blocking=True): """ Synchronously or asynchronously convert the data type of the layer, the device is not supported now. """ + assert isinstance(device, str), "Device must be type str" assert device == self._default_device, "New devices are not supported, because of the optimizer state is not sync" + self._layer.to(device=device, dtype=dtype, blocking=blocking) + + # Re-build the buckets, hooks, etc.. + self._fresh_trainable() + def _fresh_trainable(self): """ Whether to update training parameters. """ @@ -283,12 +309,17 @@ def reduce(*_): self._grad_reduced[index] = False if not self._accumulate_grads: param.grad.scale_(scale=self._world_size_scaling) - param._reset_grad_inplace_version(True) + param._reset_grad_inplace_version(True) # Clear the gradient that does not belong to the current rank through the callback function def cleanup(): if dst_rank != self._rank: param.clear_gradient(False) + elif self._offload: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from(param.grad.cpu( + ).cast(dtype=Type.fp32.value)) + param.clear_gradient(False) # Synchronize the reduce parameter gradient self._tasks_flow.append( @@ -339,6 +370,15 @@ def cleanup(): grad_storage.buffer.value().get_tensor()._clear( ) + elif self._offload: + grad_storage.to(device=self._offload_device) + for param in grad_storage._params: + self._sharding_optimizers[0]._master_params[ + param.name]._copy_gradient_from( + param.grad.cast( + dtype=Type.fp32.value)) + grad_storage.buffer.value().get_tensor()._clear( + ) # Reduce the bucket grad_storage.sent = True @@ -478,7 +518,7 @@ def _build_grad_storages(self): # Rebuild fp16/fp32 grad storages for dtype in self._grad_storages.keys(): for dst_rank, grad_storage in self._grad_storages[dtype].items(): - if dst_rank != self._rank: + if self._offload or dst_rank != self._rank: grad_storage.manumal_relase() grad_storage.rebuild() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index d4c443e385f6fe..651bed82396d1d 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -17,10 +17,17 @@ from collections import abc from enum import Enum from 
math import inf +import numpy as np +from types import MethodType import paddle import paddle.distributed as dist +from paddle import _C_ops from paddle.fluid import core +from paddle.fluid import layers +from paddle.fluid.dygraph import to_variable +from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph import base as imperative_base class Taskflow: @@ -41,6 +48,88 @@ class Type(Enum): fp32 = paddle.float32 +class ShardingClipGrad: + def __init__(self, clip, group, device): + self._clip = clip + self._group = group + self._device = device + + @imperative_base.no_grad + def _dygraph_clip(self, params_grads): + params_and_grads = [] + + sum_square_fp16 = [] + sum_square_fp32 = [] + + for p, g in params_grads: + if g is None or getattr(p, 'need_clip', True) is False: + continue + + merge_grad = g + if g.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.get_tensor_from_selected_rows( + layers.merge_selected_rows(g)) + square = layers.square(merge_grad) + sum_square = layers.reduce_sum(square) + + if p.dtype == paddle.float16: + sum_square_fp16.append(sum_square) + elif p.dtype == paddle.float32: + sum_square_fp32.append(sum_square) + + # global norm of non-distributed FP16 params_and_grads + if len(sum_square_fp16) == 0: + global_norm_fp16 = paddle.to_tensor([0.], dtype=paddle.float32) + else: + global_norm_fp16 = layers.concat(sum_square_fp16) + global_norm_fp16 = layers.reduce_sum(global_norm_fp16) + global_norm_fp16 = paddle.cast( + global_norm_fp16, dtype=paddle.float32) + + # global norm of non-distributed FP32 params_and_grads + global_norm_fp32 = layers.concat(sum_square_fp32) if len( + sum_square_fp32) != 0 else paddle.to_tensor( + [0.], dtype=paddle.float32) + global_norm_fp32 = layers.reduce_sum(global_norm_fp32) + + global_norm_var = global_norm_fp16 + global_norm_fp32 + + # add all reduce to get global norm of distributed params_and_grads + dev_id = int(self._device.split(":")[1]) + with device_guard(dev_id, "gpu"): + paddle.distributed.all_reduce(global_norm_var, group=self._group) + + global_norm_var = layers.sqrt(global_norm_var) + max_global_norm = layers.fill_constant( + shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) + + clip_var = layers.elementwise_div( + x=max_global_norm, + y=layers.elementwise_max( + x=global_norm_var, y=max_global_norm)) + clip_var_fp16 = paddle.cast(clip_var, paddle.float16) + + for p, g in params_grads: + if g is None: + continue + if getattr(p, 'need_clip', True) is False: + params_and_grads.append((p, g)) + continue + if p.dtype == paddle.float16: + new_grad = layers.elementwise_mul(x=g, y=clip_var_fp16) + else: + new_grad = layers.elementwise_mul(x=g, y=clip_var) + params_and_grads.append((p, new_grad)) + + return params_and_grads + + def __getattr__(self, item): + return getattr(self._clip, item) + + def __call__(self, params_grads): + return self._dygraph_clip(params_grads) + + @contextlib.contextmanager def device_guard(dev_id, device="cpu"): origin_device = paddle.device.get_device() @@ -52,3 +141,65 @@ def device_guard(dev_id, device="cpu"): yield finally: paddle.set_device(origin_device) + + +@dygraph_only +def ShardingScaler(scaler, sharding_group): + def unscale_method(self, optimizer): + if not self._enable: + return + param_grads = [] + param_grads_fp16 = [] + param_grads_fp32 = [] + + if getattr(optimizer, '_param_groups', None) and isinstance( + optimizer._param_groups[0], dict): + + for group in optimizer._param_groups: + for param in group['params']: + if param._grad_ivar() is not 
None: + param_grads.append(param._grad_ivar()) + if param._grad_ivar( + ).dtype == core.VarDesc.VarType.FP16: + param_grads_fp16.append(param._grad_ivar()) + else: + param_grads_fp32.append(param._grad_ivar()) + else: + param_grads = [ + param._grad_ivar() for param in optimizer._parameter_list + if param._grad_ivar() is not None + ] + param_grads_fp16 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP16 + ) + ] + param_grads_fp32 = [ + param._grad_ivar() for param in optimizer._parameter_list + if (param._grad_ivar() is not None + ) and (param._grad_ivar().dtype == core.VarDesc.VarType.FP32 + ) + ] + temp_found_inf_fp16 = to_variable(np.array([0]).astype(np.bool)) + temp_found_inf_fp32 = to_variable(np.array([0]).astype(np.bool)) + if len(param_grads_fp16): + _C_ops.check_finite_and_unscale(param_grads_fp16, self._scale, + param_grads_fp16, + temp_found_inf_fp16) + if len(param_grads_fp32): + _C_ops.check_finite_and_unscale(param_grads_fp32, self._scale, + param_grads_fp32, + temp_found_inf_fp32) + + self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0 + is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32") + + paddle.distributed.all_reduce( + is_found_inf, + op=paddle.distributed.ReduceOp.MAX, + group=sharding_group) + self._found_inf = is_found_inf.numpy()[0] + + scaler._unscale = MethodType(unscale_method, scaler) + return scaler diff --git a/python/paddle/distributed/fleet/utils/internal_storage.py b/python/paddle/distributed/fleet/utils/internal_storage.py index ff41ca217e43bb..f44b57ede468b2 100644 --- a/python/paddle/distributed/fleet/utils/internal_storage.py +++ b/python/paddle/distributed/fleet/utils/internal_storage.py @@ -50,6 +50,29 @@ def __init__(self, size, dtype, device, convert_cpu=False): else: self.buffer = paddle.zeros(size, dtype=dtype) + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + assert self.buffer is not None, "Cannot move a collapsed bucket, please rebuild it" + assert (dtype == Type.fp32.value or + Type.fp16.value), "Conversion type is not supported now" + + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + + if self._device != device: + tmp_buffer = self.buffer.cuda( + dev_id) if device == "gpu" else self.buffer.cpu() + for param in self._params: + param.clear_gradient(False) + param._gradient_set_empty(False) + self.buffer.value().get_tensor()._clear() + self.buffer = tmp_buffer + + if dtype is not None: + self.buffer = self.buffer.cast(dtype=dtype) + class ParamStorage(InternalStorage): """ @@ -60,6 +83,16 @@ def __init__(self, size, dtype, device): super().__init__(size, dtype, device, convert_cpu=True) self.param2align = None + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + + super().to(device, dtype) + + if keep_alignment: + self._array_params() + @fluid.dygraph.no_grad def add_rank_params(self, trainable_params, param2align): """ @@ -78,7 +111,7 @@ def add_rank_params(self, trainable_params, param2align): p_shape = self._add_param_as_view(param, param2align[param.name]) cpu_param_shape.append(p_shape) - # buffer covert from cpu to cuda + # buffer convert from cpu to cuda dev_id = int(paddle.get_device().split(":")[1]) self.buffer = self.buffer.cuda(dev_id) self._fill = 0 @@ -109,7 +142,8 @@ def _add_param_as_view(self, param, align): param.stop_gradient = origin_state # 
Copy the current param value - dev_id = int(paddle.get_device().split(":")[1]) + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) with device_guard(dev_id, "cpu"): tmp_var = core.VarBase(tensor=self.buffer._slice(self._fill, var_end)) @@ -134,6 +168,18 @@ def _convert_buffer(self, param, p_shape, align): self._fill = offset + @fluid.dygraph.no_grad + def _array_params(self): + """ + Given the parameters which have been registered previously, rebuild the whole InternalStorage. + """ + assert len(self._params) > 0 + assert self.param2align is not None + + self._fill = 0 + for p in self._params: + self._convert_buffer(p, p.shape, self.param2align[p.name]) # modify + class GradStorage(InternalStorage): """ @@ -171,6 +217,18 @@ def can_add_grad_view(self, param, align): param.shape) + align <= self._max_size and id( param) not in self._param_ids + def to(self, device, dtype=None, keep_alignment=True): + """ + Move the underlying buffer + """ + if self._release: + self.rebuild() + + super().to(device, dtype) + + if keep_alignment: + self._array_grads() + @fluid.dygraph.no_grad def add_grad(self, param, align): """ @@ -206,17 +264,25 @@ def rebuild(self): """ Given the parameter gradients which have been registered previously, rebuild the whole InternalStorage. """ - assert len(self._params) > 0 if self._release: - self.buffer = paddle.zeros( - [self._max_size], dtype=self._params[0].dtype) + self.buffer = paddle.zeros([self._max_size], dtype=self._dtype) for p in self._params: self._add_grad_as_view(p, self._parm2align[p.name]) self._release = False + @fluid.dygraph.no_grad + def _array_grads(self): + """ + Given the parameters gradients which have been registered previously, rebuild the whole InternalStorage. + """ + if len(self._params) > 0: + self._fill = 0 + for p in self._params: + self._add_grad_as_view(p, self._parm2align[p.name]) + @fluid.dygraph.no_grad def _add_grad_as_view(self, param, align): assert np.prod( @@ -229,8 +295,17 @@ def _add_grad_as_view(self, param, align): assert offset <= np.prod(self.buffer.shape) # Copy the current grad value to InternalStorage - assert self._device == "gpu" - tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) - param._copy_gradient_from(tmp_var) - tmp_var.value().get_tensor()._clear() + dev_id = 0 if paddle.get_device() == "cpu" else int(paddle.get_device() + .split(":")[1]) + if self._device == "cpu": + with device_guard(dev_id, self._device): + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + + elif self._device == "gpu": + tmp_var = core.VarBase(self.buffer._slice(self._fill, grad_end)) + param._copy_gradient_from(tmp_var) + tmp_var.value().get_tensor()._clear() + self._fill = offset diff --git a/python/paddle/distribution.py b/python/paddle/distribution.py index e30d3e4c20a92a..cf198eab1e8e02 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -305,7 +305,8 @@ def sample(self, shape, seed=0): else: output_shape = shape + batch_shape output = nn.uniform_random( - output_shape, seed=seed, dtype=self.dtype) * (tensor.zeros( + output_shape, dtype=self.dtype, min=0., max=1., + seed=seed) * (tensor.zeros( output_shape, dtype=self.dtype) + (self.high - self.low)) output = elementwise_add(output, self.low, name=name) if self.all_arg_is_float: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py 
b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 125d9fa88d4aed..5d29dc522b3ef6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -484,7 +484,7 @@ def save_quantized_model(self, model, path, input_spec=None, **config): model_filename=model_filename, params_filename=params_filename)) - self._gather_scales(infer_program, scope) + self._gather_scales(infer_program, scope, fetch_targets) self._set_skip_quant_attr(infer_program) @@ -520,10 +520,10 @@ def _is_target_layer(self, layer): return flag - def _gather_scales(self, program, scope): + def _gather_scales(self, program, scope, fetch_targets): """ Get all scales from fake ops, save them into the corresponding ops - and delete all moving_average_abs_max_scale ops. + and delete all moving_average_abs_max_scale ops. """ def _gather_input_scale(): @@ -580,6 +580,11 @@ def _gather_output_scale(): for next_op in next_ops: next_op._rename_input(out_var_name, in_var_name) + # If next_op is `fetch` and out_var_name in fetch_targets, + # fetch_targets must update to in_var_name when rename input. + for i in range(len(fetch_targets)): + if fetch_targets[i].name == out_var_name: + fetch_targets[i] = block.var(in_var_name) _gather_input_scale() _gather_output_scale() diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 3e033f70aca38a..1ddb9c8e5fa9f4 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -410,6 +410,23 @@ def quantize(self): for op_type in self._dynamic_quantize_op_type): self._collect_dynamic_quantize_op_threshold( self._dynamic_quantize_op_type) + + # Move sub blocks persistable var to global block + global_block = self._program.global_block() + for _op in global_block.ops: + if _op.type == "while": + _block_id = _op.attr("sub_block").id + _block = self._program.block(_block_id) + persistables = [] + for _name, _var in _block.vars.items(): + if _var.persistable: + global_block._clone_variable(_var) + persistables.append(_name) + for _name in persistables: + _block._remove_var(_name) + persistables.extend(_op.input('X')) + _op.desc.set_input("X", persistables) + return self._program def save_quantized_model(self, @@ -451,10 +468,6 @@ def _load_model_data(self): model_filename=self._model_filename, params_filename=self._params_filename) - if self._program.num_blocks > 1: - _logger.error("The post training quantization requires that the " - "program only has one block.") - if self._optimize_model: self._optimize_fp32_model() @@ -505,23 +518,26 @@ def collect_var_name(var_name_list, persistable_var_names, op_type): self._quantized_act_var_name.add(var_name) persistable_var_names = _all_persistable_var_names(self._program) - for op in self._program.global_block().ops: - op_type = op.type - if self._is_full_quantize and \ - op_type not in self._quantizable_op_type: - _logger.warning(op_type + " is not supported for quantization.") - # For quantized ops, sample inputs and outputs - if op_type in self._quantizable_op_type: - collect_var_name( - _get_op_input_var_names(op), persistable_var_names, op_type) - collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) - # For other op, only sample output scale - elif op_type in self._out_scale_op_list: - 
collect_var_name( - _get_op_output_var_names(op), persistable_var_names, - op_type) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + op_type = op.type + if self._is_full_quantize and \ + op_type not in self._quantizable_op_type: + _logger.warning(op_type + + " is not supported for quantization.") + # For quantized ops, sample inputs and outputs + if op_type in self._quantizable_op_type: + collect_var_name( + _get_op_input_var_names(op), persistable_var_names, + op_type) + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) + # For other op, only sample output scale + elif op_type in self._out_scale_op_list: + collect_var_name( + _get_op_output_var_names(op), persistable_var_names, + op_type) def _set_activation_persistable(self): ''' @@ -696,16 +712,17 @@ def _save_input_threhold(self): ''' assert self._algo == "min_max", \ "The algo should be min_max to save input threshold." - for op in self._program.global_block().ops: - if op.type in self._quantizable_op_type: - for var_name in _get_op_input_var_names(op): - assert var_name in self._quantized_var_min - assert var_name in self._quantized_var_max - op._set_attr(var_name + ".min", - self._quantized_var_min[var_name]) - op._set_attr(var_name + ".max", - self._quantized_var_max[var_name]) - op._set_attr("with_quant_attr", True) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in self._quantizable_op_type: + for var_name in _get_op_input_var_names(op): + assert var_name in self._quantized_var_min + assert var_name in self._quantized_var_max + op._set_attr(var_name + ".min", + self._quantized_var_min[var_name]) + op._set_attr(var_name + ".max", + self._quantized_var_max[var_name]) + op._set_attr("with_quant_attr", True) def _collect_activation_abs_min_max(self): ''' @@ -795,7 +812,12 @@ def _update_program(self): activation_quantize_type=self._activation_quantize_type, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - transform_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + # Insert fake_quant/fake_dequantize op must in test graph, so + # set per graph's _for_test is True. 
+ sub_graph._for_test = True + transform_pass.apply(sub_graph) # use AddQuantDequantPass to insert fake_quant_dequant op minor_quantizable_op_types = [] @@ -806,7 +828,10 @@ def _update_program(self): scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types) - add_quant_dequant_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + add_quant_dequant_pass.apply(sub_graph) # save threshold to scale var node if self._algo in ["KL", "hist"]: @@ -836,7 +861,11 @@ def _update_program(self): activation_bits=self._activation_bits, weight_quantize_type=self._weight_quantize_type, quantizable_op_type=major_quantizable_op_types) - freeze_pass.apply(graph) + + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + freeze_pass.apply(sub_graph) + self._program = graph.to_program() def _save_output_threshold(self): @@ -888,13 +917,15 @@ def analysis_and_save_info(op_node, out_var_name): save_info(op_node, out_var_name, self._quantized_var_max, "out_max", "post_min_max") - for op in self._program.global_block().ops: - if op.type in (self._quantizable_op_type + self._out_scale_op_list): - out_var_names = _get_op_output_var_names(op) - assert len(out_var_names) == 1, "Post training " + \ - "quantization only support one output for " + op.type - for var_name in out_var_names: - analysis_and_save_info(op, var_name) + for block_id in range(len(self._program.blocks)): + for op in self._program.blocks[block_id].ops: + if op.type in ( + self._quantizable_op_type + self._out_scale_op_list): + out_var_names = _get_op_output_var_names(op) + assert len(out_var_names) == 1, "Post training " + \ + "quantization only support one output for " + op.type + for var_name in out_var_names: + analysis_and_save_info(op, var_name) def _collect_dynamic_quantize_op_threshold(self, target_ops_type): """ diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 94d7a2ed153488..494ea969797197 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -139,6 +139,7 @@ endfunction() if(WIN32) list(REMOVE_ITEM TEST_OPS test_light_nas) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mnist) + list(REMOVE_ITEM TEST_OPS test_post_training_quantization_while) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_mobilenetv1) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_resnet50) list(REMOVE_ITEM TEST_OPS test_post_training_quantization_lstm_model) @@ -336,6 +337,7 @@ if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 600 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) + set_tests_properties(test_post_training_quantization_while PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ptq PROPERTIES TIMEOUT 120) set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py new file mode 100644 index 00000000000000..3c3dfd08fccfa3 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_while.py @@ -0,0 +1,313 @@ +# copyright (c) 2021 
paddlepaddle authors. all rights reserved. +# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. +import unittest +import os +import time +import sys +import random +import math +import functools +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.dataset.common import download +from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization + +paddle.enable_static() + +random.seed(0) +np.random.seed(0) + + +class TestPostTrainingQuantization(unittest.TestCase): + def setUp(self): + self.download_path = 'int8/download' + self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + + self.download_path) + self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) + self.int8_model_path = os.path.join(os.getcwd(), + "post_training_" + self.timestamp) + try: + os.system("mkdir -p " + self.int8_model_path) + except Exception as e: + print("Failed to create {} due to {}".format(self.int8_model_path, + str(e))) + sys.exit(-1) + + def tearDown(self): + try: + os.system("rm -rf {}".format(self.int8_model_path)) + except Exception as e: + print("Failed to delete {} due to {}".format(self.int8_model_path, + str(e))) + + def cache_unzipping(self, target_folder, zip_path): + cmd = 'tar xf {0} -C {1}'.format(zip_path, target_folder) + os.system(cmd) + + def download_model(self, data_url, data_md5, folder_name): + download(data_url, self.download_path, data_md5) + file_name = data_url.split('/')[-1] + zip_path = os.path.join(self.cache_folder, file_name) + print('Data is downloaded at {0}'.format(zip_path)) + + data_cache_folder = os.path.join(self.cache_folder, folder_name) + self.cache_unzipping(self.cache_folder, zip_path) + return data_cache_folder + + def run_program(self, model_path, batch_size, infer_iterations): + print("test model path:" + model_path) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + [infer_program, feed_dict, fetch_targets] = \ + fluid.io.load_inference_model(model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', executor=exe) + val_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size) + + img_shape = [1, 28, 28] + test_info = [] + cnt = 0 + periods = [] + for batch_id, data in enumerate(val_reader()): + image = np.array( + [x[0].reshape(img_shape) for x in data]).astype("float32") + input_label = np.array([x[1] for x in data]).astype("int64") + + t1 = time.time() + out = exe.run(infer_program, + feed={feed_dict[0]: image}, + fetch_list=fetch_targets) + t2 = time.time() + period = t2 - t1 + periods.append(period) + + out_label = np.argmax(np.array(out[0]), axis=1) + top1_num = sum(input_label == out_label) + test_info.append(top1_num) + cnt += len(data) + + if (batch_id + 1) == infer_iterations: + break + + throughput = cnt / np.sum(periods) + latency = np.average(periods) + acc1 = np.sum(test_info) / cnt + return (throughput, latency, acc1) + + def generate_quantized_model(self, + model_path, + algo="KL", + quantizable_op_type=["conv2d"], + is_full_quantize=False, + 
is_use_cache_file=False, + is_optimize_model=False, + batch_size=10, + batch_nums=10): + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + scope = fluid.global_scope() + val_reader = paddle.dataset.mnist.train() + + ptq = PostTrainingQuantization( + executor=exe, + model_dir=model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams', + sample_generator=val_reader, + batch_size=batch_size, + batch_nums=batch_nums, + algo=algo, + quantizable_op_type=quantizable_op_type, + is_full_quantize=is_full_quantize, + optimize_model=is_optimize_model, + is_use_cache_file=is_use_cache_file) + ptq.quantize() + ptq.save_quantized_model( + self.int8_model_path, + model_filename='model.pdmodel', + params_filename='model.pdiparams') + + def run_test(self, + model_name, + data_url, + data_md5, + algo, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_size=10, + infer_iterations=10, + quant_iterations=5): + + origin_model_path = self.download_model(data_url, data_md5, model_name) + #origin_model_path = os.path.join(origin_model_path, model_name) + + print("Start FP32 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + origin_model_path, batch_size, infer_iterations) + + print("Start INT8 post training quantization for {0} on {1} images ...". + format(model_name, quant_iterations * batch_size)) + self.generate_quantized_model( + origin_model_path, algo, quantizable_op_type, is_full_quantize, + is_use_cache_file, is_optimize_model, batch_size, quant_iterations) + + print("Start INT8 inference for {0} on {1} images ...".format( + model_name, infer_iterations * batch_size)) + (int8_throughput, int8_latency, int8_acc1) = self.run_program( + self.int8_model_path, batch_size, infer_iterations) + + print("---Post training quantization of {} method---".format(algo)) + print( + "FP32 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.". + format(model_name, batch_size, fp32_throughput, fp32_latency, + fp32_acc1)) + print( + "INT8 {0}: batch_size {1}, throughput {2} img/s, latency {3} s, acc1 {4}.\n". 
+ format(model_name, batch_size, int8_throughput, int8_latency, + int8_acc1)) + sys.stdout.flush() + + delta_value = fp32_acc1 - int8_acc1 + self.assertLess(delta_value, diff_threshold) + + +class TestPostTrainingKLForWhile(TestPostTrainingQuantization): + def test_post_training_kl(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "KL" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTraininghistForWhile(TestPostTrainingQuantization): + def test_post_training_hist(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "hist" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingmseForWhile(TestPostTrainingQuantization): + def test_post_training_mse(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "mse" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingavgForWhile(TestPostTrainingQuantization): + def test_post_training_avg(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "avg" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingMinMaxForWhile(TestPostTrainingQuantization): + def test_post_training_min_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "min_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + 
quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +class TestPostTrainingAbsMaxForWhile(TestPostTrainingQuantization): + def test_post_training_abs_max(self): + model_name = "mnist_while" + data_url = "http://paddle-inference-dist.bj.bcebos.com/int8/mnist_while.tar.gz" + data_md5 = "2387390beeb37b51dec041c27b8a681f" + algo = "abs_max" + quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] + is_full_quantize = False + is_use_cache_file = False + is_optimize_model = True + diff_threshold = 0.01 + batch_size = 10 + infer_iterations = 50 + quant_iterations = 5 + self.run_test(model_name, data_url, data_md5, algo, quantizable_op_type, + is_full_quantize, is_use_cache_file, is_optimize_model, + diff_threshold, batch_size, infer_iterations, + quant_iterations) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index d8cb3e0918dc8d..10a9358612960c 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -273,6 +273,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( @@ -718,6 +720,8 @@ def __next__(self): else: if self._return_list: data = self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() data = [ _restore_batch(d, s) for d, s in zip(data, self._structure_infos[:len( diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e601c1cb4c3005..d10564e21ea47b 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1999,6 +1999,14 @@ def _run_using_fleet_executor(self, fetch_list=fetch_list, feed_var_name=feed_var_name, fetch_var_name=fetch_var_name) + main_block = cached_program.block(0) + for op in main_block.ops: + # set the op_role of fetch op to Optimize to avoid + # erase the fetched vars by gc for pipeline + if op.type == 'fetch': + op._set_attr( + 'op_role', + core.op_proto_and_checker_maker.OpRole.Optimize) self._add_program_cache(cache_key, cached_program) if cached_ctx is None: fleet_opt = program._pipeline_opt["fleet_opt"] @@ -2007,6 +2015,18 @@ def _run_using_fleet_executor(self, self._add_ctx_cache(cache_key, cached_ctx) if feed: self._feed_data(cached_program, feed, feed_var_name, cached_scope) + + from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): + lr_sheduler = program.lr_sheduler + assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" + lr_value = lr_sheduler() + lr_var = program.global_block().vars[lr_sheduler._var_name] + data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) + tensor = core.get_variable_tensor(cached_scope, + lr_sheduler._var_name) + tensor.set(data, self.place) + cached_ctx.run() if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py index dfc887292e7cff..83ccd1051bb669 100644 --- a/python/paddle/fluid/reader.py +++ b/python/paddle/fluid/reader.py @@ -1254,7 +1254,10 @@ def __iter__(self): def __next__(self): try: if self._return_list: - return self._reader.read_next_list() + data = 
self._reader.read_next_list() + for i in range(len(data)): + data[i] = data[i]._move_to_list() + return data else: return self._reader.read_next() except StopIteration: diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py index 05008a3bc12f7e..2b4002ab9c9d49 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2.py @@ -30,6 +30,7 @@ seed = 2021 epoch = 2 batch_size = 32 +linear_size = 10000 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -45,12 +46,12 @@ class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): + def __init__(self, linear_size=10000, param_attr=None, bias_attr=None): super(MLP, self).__init__() - self._linear1 = Linear(10000, 10000) - self._linear2 = Linear(10000, 10000) - self._linear3 = Linear(10000, 10) + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) def forward(self, inputs): y = self._linear1(inputs) @@ -59,10 +60,10 @@ def forward(self, inputs): return y -def reader_decorator(): +def reader_decorator(linear_size=10000): def __reader__(): for _ in range(100): - img = np.random.rand(10000).astype('float32') + img = np.random.rand(linear_size).astype('float32') label = np.ones(1).astype('int64') yield img, label @@ -120,6 +121,9 @@ def train_mlp(model, use_multiprocess=True) train_loader.set_sample_list_generator(train_reader) + if sharding_stage == 2: + model.to(device="gpu") + for eop in range(epoch): model.train() @@ -153,9 +157,6 @@ def train_mlp(model, if all_test and batch_id == 2: return model.parameters() - if sharding_stage == 2: - model.to(device="gpu") - return model.parameters() diff --git a/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py new file mode 100644 index 00000000000000..8adcda9d24e1ca --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_sharding_stage2_offload.py @@ -0,0 +1,115 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
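Note: the offload test that follows exercises `ShardingScaler`, which customizes an existing `GradScaler` instance by rebinding its `_unscale` method at runtime rather than subclassing. A minimal sketch of that patching idiom (hypothetical class and function names):

from types import MethodType


class Scaler:
    def _unscale(self, optimizer):
        print("default unscale")


def sharding_unscale(self, optimizer):
    # A real implementation would also all_reduce the found-inf flag across
    # the sharding group, as ShardingScaler's unscale_method does above.
    print("unscale with cross-rank inf/nan check")


scaler = Scaler()
# Rebind _unscale on this one instance; other Scaler objects keep the default.
scaler._unscale = MethodType(sharding_unscale, scaler)
scaler._unscale(optimizer=None)  # -> unscale with cross-rank inf/nan check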
+ +import numpy as np +import argparse +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn + +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler + +from dygraph_sharding_stage2 import MLP, reader_decorator, optimizer_setting + +seed = 2021 +epoch = 2 +batch_size = 32 +linear_size = 8000 + +np.random.seed(seed) +paddle.seed(seed) + + +def train_mlp(model, offload=False): + group = paddle.distributed.new_group([0, 1]) + optimizer = optimizer_setting(model=model, use_pure_fp16=True) + + model = paddle.amp.decorate(models=model, level='O2', save_dtype='float32') + scaler = paddle.amp.GradScaler(init_loss_scaling=32768) + scaler = ShardingScaler(scaler, group) + + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload) + model = ShardingStage2(model, optimizer, group=group, accumulate_grads=True) + + train_reader = paddle.batch( + reader_decorator(linear_size), batch_size=batch_size, drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator( + capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + with paddle.amp.auto_cast(True, level='O2'): + out = model(img) + loss = paddle.nn.functional.cross_entropy( + input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + scaler.scale(avg_loss).backward() + + model.grad_scale() + scaler.step(optimizer) + scaler.update() + model.clear_gradients() + + for dtype in optimizer.param_storages: + for dst_rank, param_storage in optimizer.param_storages[dtype].items(): + param_storage.to(device="gpu", dtype=dtype) + + return model.parameters() + + +def test_sharding_stage2_offload(): + mlp = MLP(linear_size) + mlp_offload = MLP(linear_size) + mlp_offload.set_state_dict(mlp.state_dict()) + + mlp_params = train_mlp(mlp, offload=False) + mlp_offload_params = train_mlp(mlp_offload, offload=True) + + for i in range(len(mlp_params)): + for j in range(len(mlp_offload_params)): + if mlp_params[i].name == mlp_offload_params[j].name: + np.testing.assert_allclose( + mlp_params[i].numpy(), + mlp_offload_params[j].numpy(), + rtol=1e-6) + return + + +if __name__ == '__main__': + test_sharding_stage2_offload() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index 33754fac127a44..c05ad30da2797f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -322,14 +322,14 @@ def assert_op_list(self, op_list_after_fusion): "Expected operator list after fusion is {}, but now it's {}".format( op_list_after_fusion, after_op_list), ) - def run_and_statis( - self, - quant=False, - max_examples=100, - reproduce=None, - min_success_num=25, - max_duration=180, - passes=None, ): + def run_and_statis(self, + quant=False, + 
max_examples=100, + reproduce=None, + min_success_num=25, + max_duration=180, + passes=None, + use_gpu_run_baseline=False): if os.getenv('HYPOTHESIS_TEST_PROFILE', 'ci') == "dev": max_examples *= 10 min_success_num *= 10 @@ -354,7 +354,10 @@ def program_generator(draw): return self.sample_program_config(draw) def run_test(prog_config): - return self.run_test(quant=quant, prog_configs=[prog_config]) + return self.run_test( + quant=quant, + prog_configs=[prog_config], + use_gpu_run_baseline=use_gpu_run_baseline) generator = st.composite(program_generator) loop_func = given(generator())(run_test) @@ -371,8 +374,8 @@ def run_test(prog_config): logging.info("Number of Ran Programs: {}".format(self.num_ran_programs)) logging.info("Number of Ignore Tests: {}".format(self.num_ignore_tests)) successful_ran_programs = int(self.num_ran_programs - - self.num_ignore_tests / - self.num_predictor_kinds) + self.num_ignore_tests / max( + self.num_predictor_kinds, 1)) logging.info( "Number of successfully ran programs approximately equal to {}". format(successful_ran_programs)) @@ -391,7 +394,10 @@ def run_test(prog_config): format(max_duration)) assert False - def run_test(self, quant=False, prog_configs=None): + def run_test(self, + quant=False, + prog_configs=None, + use_gpu_run_baseline=False): status = True for prog_config in prog_configs: @@ -413,7 +419,9 @@ def run_test(self, quant=False, prog_configs=None): results: List[Dict[str, np.ndarray]] = [] # baseline: cpu no ir_optim run - base_config = self.create_inference_config(ir_optim=False) + + base_config = self.create_inference_config( + ir_optim=False, use_gpu=use_gpu_run_baseline) logging.info('RUN program_config: ' + str(prog_config)) results.append( self.run_test_config(model, params, prog_config, base_config, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py index 8cb6af1dcf0441..96c2a175208faa 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_adaptive_pool2d_convert_global_pass_autoscan.py @@ -109,7 +109,7 @@ def teller2(program_config, predictor_config): def test(self): self.run_and_statis( quant=False, - max_examples=100, + max_examples=300, passes=["adaptive_pool2d_convert_global_pass"], min_success_num=40) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py new file mode 100644 index 00000000000000..2ccb9de5d5470f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_fc_elementwise_layernorm_fuse_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
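Two of the `auto_scan_test.py` hunks above deserve a gloss: `use_gpu_run_baseline` threads through `run_and_statis` and `run_test` into `create_inference_config`, so a test whose fused op runs on GPU (the new fc_elementwise_layernorm test below passes `use_gpu_run_baseline=True`) compares against a GPU baseline instead of the default CPU one; and `max(self.num_predictor_kinds, 1)` stops the statistics line from dividing by zero before any predictor config has been sampled. A minimal sketch of the guarded statistic, with hypothetical counter values rather than the real harness state:

    def successful_ran_programs(num_ran_programs, num_ignore_tests,
                                num_predictor_kinds):
        # Ignored runs are tallied once per predictor kind, so divide them
        # back out; clamping the divisor to 1 avoids ZeroDivisionError while
        # num_predictor_kinds is still 0.
        return int(num_ran_programs -
                   num_ignore_tests / max(num_predictor_kinds, 1))

    assert successful_ran_programs(10, 4, 2) == 8
    assert successful_ran_programs(10, 0, 0) == 10  # the old expression raised here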
+ +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st + + +class TestFCElementwiseLayerNormFusePass(PassAutoScanTest): + """ + x_var w(persistable) bias_var(persistable) + \ | / + fc + | + fc_out_var bias_var(persistable) + \ / + elementwise_add bias_var(persistable) scale_var(persistable) + \ | / + layer_norm + / | \ + Y mean_var variance_var + """ + + def sample_predictor_configs(self, program_config): + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fused_fc_elementwise_layernorm"], (1e-5, 1e-5) + + def sample_program_config(self, draw): + # 1. Generate shape of input:X of fc + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=5)) + x_shape = [2, 1] + x_rank = len(x_shape) + # 2. Generate attr:in_num_col_dims of fc + in_num_col_dims = draw(st.integers(min_value=1, max_value=x_rank - 1)) + # 3. Generate legal shape of input:W/bias of fc + w_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=8), min_size=2, max_size=2)) + w_shape[0] = int(np.prod(x_shape[in_num_col_dims:])) + w_shape = [1, 2] + fc_bias_shape = [w_shape[1], ] + if draw(st.booleans()): + fc_bias_shape.insert(0, 1) + fc_bias_shape = [2, ] + fc_out_shape = x_shape[:in_num_col_dims] + w_shape[1:] + # 4. Generate legal attr:axis/shape of elementwise_add + add_bias_shape = fc_out_shape[:] + axis = draw(st.integers(min_value=-1, max_value=0)) + # 5. 
Generate legal shape of layer_norm + begin_norm_axis = draw( + st.integers( + min_value=1, max_value=len(fc_out_shape) - 1)) + layer_norm_shape = [int(np.prod(fc_out_shape[begin_norm_axis:]))] + epsilon = 1e-5 + + fc_op = OpConfig( + "fc", + inputs={"Input": ["fc_x"], + "W": ["fc_w"], + "Bias": ["fc_bias"]}, + outputs={"Out": ["fc_out"]}, + in_num_col_dims=in_num_col_dims, + padding_weights=False, + activation_type="", + use_quantizer=False, + use_mkldnn=False, ) + add_op = OpConfig( + "elementwise_add", + inputs={"X": ["fc_out"], + "Y": ["add_bias"]}, + outputs={"Out": ["add_out"]}, + axis=axis, ) + layer_norm_op = OpConfig( + "layer_norm", + inputs={ + "X": ["add_out"], + "Scale": ["scale"], + "Bias": ["layer_norm_bias"] + }, + outputs={ + "Y": ["layer_norm_out"], + "Mean": ["layer_norm_mean"], + "Variance": ["layer_norm_var"] + }, + begin_norm_axis=begin_norm_axis, + epsilon=epsilon) + + ops = [fc_op, add_op, layer_norm_op] + program_config = ProgramConfig( + ops=ops, + weights={ + "fc_w": TensorConfig(shape=w_shape), + "fc_bias": TensorConfig(shape=fc_bias_shape), + "add_bias": TensorConfig(shape=add_bias_shape), + "scale": TensorConfig(shape=layer_norm_shape), + "layer_norm_bias": TensorConfig(shape=layer_norm_shape), + }, + inputs={"fc_x": TensorConfig(shape=x_shape), }, + outputs=ops[-1].outputs["Y"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["fc_elementwise_layernorm_fuse_pass"], + use_gpu_run_baseline=True) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py index 83d4b7091cb327..64c3042b63cf8e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_transpose_flatten_concat_fuse_pass.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,72 +12,147 @@ # See the License for the specific language governing permissions and # limitations under the License. +from auto_scan_test import PassAutoScanTest, IgnoreReasons +from program_config import TensorConfig, ProgramConfig, OpConfig +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set import unittest -import numpy as np -from inference_pass_test import InferencePassTest -import paddle.fluid as fluid -import paddle.fluid.core as core -from paddle.fluid.core import PassVersionChecker +import hypothesis +from hypothesis import given, settings, seed, example, assume, reproduce_failure +import hypothesis.strategies as st -class TransposeFlattenConcatFusePassTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1) - flatt2 = fluid.layers.flatten(trans2) - concat_out = fluid.layers.concat([flatt1, flatt2]) - # There is no parameters for above structure. 
- # Hence, append a batch_norm to avoid failure caused by load_combined. - out = fluid.layers.batch_norm(concat_out, is_test=True) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] +class TestTransposeFlattenConcatFusePass(PassAutoScanTest): + """ + x_1_var x_2_var + | | + transpose2 transpose2 + | | + flatten2 flatten2 + \ / + flatten2_out_var flatten2_out_var + \ / + concat + """ - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + def sample_predictor_configs(self, program_config): + # TRT + # after tensorrt_subgraph_pass ,The pass needs to be deleted on TRT - PassVersionChecker.IsCompatible('transpose_flatten_concat_fuse_pass') + # for gpu + config = self.create_inference_config(use_gpu=True) + yield config, ["fusion_transpose_flatten_concat", ], (1e-5, 1e-5) + def is_program_valid(self, prog_config): + concat_axis = prog_config.ops[-1].attrs["axis"] + ops_num = len(prog_config.ops) - 1 + if ops_num % 2 != 0: + return False + input_num = ops_num // 2 + flatten_shape = 0 + x_trans_axis = prog_config.ops[0].attrs["axis"] + x_flatten_axis = prog_config.ops[1].attrs["axis"] + for i in range(input_num): + input_name = "transpose2_x" + str(i) + input_shape = prog_config.inputs[input_name].shape + trans_axis = prog_config.ops[i * 2].attrs["axis"] + if x_trans_axis != trans_axis: + return False + # calculate shape after transpose + input_shape = [input_shape[j] for j in trans_axis] + # calculate shape after flateen + flatten_axis = prog_config.ops[i * 2 + 1].attrs["axis"] + if x_flatten_axis != flatten_axis: + return False + flatten_shape1 = flatten_shape2 = 1 + for j in range(len(input_shape)): + if j < flatten_axis: + flatten_shape1 *= input_shape[j] + else: + flatten_shape2 *= input_shape[j] + if concat_axis == 0: + if i == 0: + flatten_shape = flatten_shape2 + elif flatten_shape != flatten_shape2: + return False + else: + if i == 0: + flatten_shape = flatten_shape1 + elif flatten_shape != flatten_shape1: + return False + return True -class TransposeFlattenConcatFusePassWithAxisTest(InferencePassTest): - def setUp(self): - with fluid.program_guard(self.main_program, self.startup_program): - data1 = fluid.data(name="data1", shape=[5, 5, 5], dtype="float32") - data2 = fluid.data(name="data2", shape=[5, 5, 5], dtype="float32") - trans1 = fluid.layers.transpose(data1, perm=[2, 1, 0]) - trans2 = fluid.layers.transpose(data2, perm=[2, 1, 0]) - flatt1 = fluid.layers.flatten(trans1, axis=2) - flatt2 = fluid.layers.flatten(trans2, axis=2) - concat_out = fluid.layers.concat([flatt1, flatt2], axis=1) - # There is no parameters for above structure. - # Hence, append a batch_norm to avoid failure caused by load_combined. 
- out = fluid.layers.batch_norm(concat_out, is_test=True) + def sample_program_config(self, draw): + times = draw(st.integers(min_value=1, max_value=6)) + concat_axis = draw(st.integers(min_value=0, max_value=1)) + ops = [] + concat_input = [] + inputs = {} + x_shape_rank = draw(st.integers(min_value=2, max_value=5)) + # Generate axis of transpose + trans_axis = [j for j in range(x_shape_rank)] + for j in range(x_shape_rank - 1): + if draw(st.booleans()): + trans_axis[j], trans_axis[-1] = trans_axis[-1], trans_axis[j] + # Generate axis of flatten + flatten_axis = draw( + st.integers( + min_value=0, max_value=x_shape_rank - 1)) + for i in range(times): + # Generate x_shape of transpose + x_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=10), + min_size=x_shape_rank, + max_size=x_shape_rank)) - self.feeds = { - "data1": np.random.random([5, 5, 5]).astype("float32"), - "data2": np.random.random([5, 5, 5]).astype("float32") - } - self.fetch_list = [out] + str_i = str(i) + transpose_op = OpConfig( + "transpose2", + inputs={"X": ["transpose2_x" + str_i], }, + axis=trans_axis, + outputs={ + "Out": ["trans_out" + str_i], + "XShape": ["trans_shape" + str_i] + }, ) + ops.append(transpose_op) + flatten_op = OpConfig( + "flatten2", + inputs={"X": ["trans_out" + str_i], }, + axis=flatten_axis, + outputs={ + "Out": ["flatten2_out" + str_i], + "XShape": ["xshape" + str_i] + }, ) + concat_input.append("flatten2_out" + str_i) + ops.append(flatten_op) + inputs["transpose2_x" + str_i] = TensorConfig(shape=x_shape) - def test_check_output(self): - # There is no cpu pass for transpose_flatten_concat_fuse - if core.is_compiled_with_cuda(): - use_gpu = True - self.check_output_with_option(use_gpu) + concat_op = OpConfig( + "concat", + inputs={ + "X": concat_input, + "AxisTensor": [], + }, + outputs={"Out": ["concat_out"]}, + axis=concat_axis, ) - self.assertTrue( - PassVersionChecker.IsCompatible( - 'transpose_flatten_concat_fuse_pass')) + ops.append(concat_op) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs=inputs, + outputs=ops[-1].outputs["Out"], ) + return program_config + + def test(self): + self.run_and_statis( + quant=False, + max_examples=300, + passes=["transpose_flatten_concat_fuse_pass"]) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_distribution.py b/python/paddle/fluid/tests/unittests/test_distribution.py index f1c12c90490c25..6cf2c5f6e2ca4b 100644 --- a/python/paddle/fluid/tests/unittests/test_distribution.py +++ b/python/paddle/fluid/tests/unittests/test_distribution.py @@ -336,6 +336,29 @@ def init_static_data(self, batch_size, dims): name='values', shape=[dims], dtype='float32') +class UniformTestSample(unittest.TestCase): + def setUp(self): + self.init_param() + + def init_param(self): + self.low = 3.0 + self.high = 4.0 + + def test_uniform_sample(self): + paddle.disable_static() + uniform = Uniform(low=self.low, high=self.high) + s = uniform.sample([100]) + self.assertTrue((s >= self.low).all()) + self.assertTrue((s < self.high).all()) + paddle.enable_static() + + +class UniformTestSample2(UniformTestSample): + def init_param(self): + self.low = -5.0 + self.high = 2.0 + + class NormalNumpy(DistributionNumpy): def __init__(self, loc, scale): self.loc = np.array(loc) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py index c5cf8c5d5ed690..f76dcb5687c2ab 100644 --- 
a/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_sharding_stage2.py @@ -26,6 +26,9 @@ class TestDygraphShardingStage2(TestMultipleGpus): def test_dygraph_sharding_optimizer_stage2(self): self.run_mnist_2gpu('dygraph_sharding_stage2.py') + def test_dygraph_sharding_optimizer_stage2_offload(self): + self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor.py index 09f9fa6ce105df..fbc5db341e5e9b 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor.py @@ -47,6 +47,18 @@ def run_fleet_executor(self, place, x_data, y_data): name='y', shape=y_data.shape, dtype=y_data.dtype) z = x + y a = 2 * x + 3 * y + loss = paddle.mean(a) + base_lr = 0.1 + passes = [30, 60, 80, 90] + steps_per_pass = 10 + bd = [steps_per_pass * p for p in passes] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + lr_val = paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr) + opt = paddle.optimizer.AdamW( + learning_rate=lr_val, + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + opt.minimize(loss) # TODO: section_program will be removed in the future empty_program._pipeline_opt = { "fleet_opt": self.fake_fleet_opt(), diff --git a/python/paddle/fluid/tests/unittests/test_gcd.py b/python/paddle/fluid/tests/unittests/test_gcd.py new file mode 100644 index 00000000000000..820216dc56cd60 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gcd.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
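The `test_fleet_executor.py` hunk above attaches a piecewise-decay AdamW step so the fleet executor exercises an optimizer rather than only elementwise ops. `PiecewiseDecay` takes one more value than boundaries (values[i] applies up to boundaries[i]); a small check of the schedule the test builds, using the same numbers as the hunk (shown for illustration only):

    import math
    import paddle

    base_lr = 0.1
    passes = [30, 60, 80, 90]
    steps_per_pass = 10
    bd = [steps_per_pass * p for p in passes]                # [300, 600, 800, 900]
    lr = [base_lr * (0.1 ** i) for i in range(len(bd) + 1)]  # 5 rates for 4 boundaries

    assert all(math.isclose(v, e)
               for v, e in zip(lr, [1e-1, 1e-2, 1e-3, 1e-4, 1e-5]))
    sched = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, values=lr)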
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestGcdAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + y = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.gcd(x, y) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.gcd(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor(self.x_np) + y = paddle.to_tensor(self.y_np) + result = paddle.gcd(x, y) + self.assertEqual( + np.allclose(np.gcd(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestGcdAPI2(TestGcdAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestGcdAPI3(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI4(TestGcdAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestGcdAPI5(TestGcdAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/fluid/tests/unittests/test_lcm.py b/python/paddle/fluid/tests/unittests/test_lcm.py new file mode 100644 index 00000000000000..123c3e3d444e1b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_lcm.py @@ -0,0 +1,93 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
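The gcd tests above use NumPy as the oracle, and the `TestGcdAPI3/4/5` subclasses pin down the edge cases the docstring further below promises: one zero operand, both operands zero, and a negative operand. Spelled out in plain NumPy, matching those assertions (illustrative only):

    import numpy as np

    # gcd is computed on absolute values; gcd(0, y) = |y| and gcd(0, 0) = 0.
    assert np.gcd(0, 20) == 20
    assert np.gcd(0, 0) == 0
    assert np.gcd(12, -20) == 4
    assert (np.gcd(np.arange(6), 20) == np.array([20, 1, 2, 1, 4, 5])).all()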
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, program_guard +from op_test import OpTest + +paddle.enable_static() + + +class TestLcmAPI(unittest.TestCase): + def setUp(self): + self.x_np = 12 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + def test_static_graph(self): + startup_program = fluid.Program() + train_program = fluid.Program() + with fluid.program_guard(startup_program, train_program): + x1 = fluid.data(name='input1', dtype='int32', shape=self.x_shape) + x2 = fluid.data(name='input2', dtype='int32', shape=self.y_shape) + out = paddle.lcm(x1, x2) + + place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + exe = fluid.Executor(place) + res = exe.run(fluid.default_main_program(), + feed={'input1': self.x_np, + 'input2': self.y_np}, + fetch_list=[out]) + self.assertTrue((np.array(res[0]) == np.lcm(self.x_np, self.y_np) + ).all()) + + def test_dygraph(self): + paddle.disable_static() + x1 = paddle.to_tensor(self.x_np) + x2 = paddle.to_tensor(self.y_np) + result = paddle.lcm(x1, x2) + self.assertEqual( + np.allclose(np.lcm(self.x_np, self.y_np), result.numpy()), True) + + paddle.enable_static() + + +class TestLcmAPI2(TestLcmAPI): + def setUp(self): + self.x_np = np.arange(6).astype(np.int32) + self.y_np = np.array([20]).astype(np.int32) + self.x_shape = [6] + self.y_shape = [1] + + +class TestLcmAPI3(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 20 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI4(TestLcmAPI): + def setUp(self): + self.x_np = 0 + self.y_np = 0 + self.x_shape = [1] + self.y_shape = [1] + + +class TestLcmAPI5(TestLcmAPI): + def setUp(self): + self.x_np = 12 + self.y_np = -20 + self.x_shape = [1] + self.y_shape = [1] diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index c5bc99546c878a..36dfd717a12a0f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -197,6 +197,8 @@ from .math import lerp_ # noqa: F401 from .math import rad2deg # noqa: F401 from .math import deg2rad # noqa: F401 +from .math import gcd # noqa: F401 +from .math import lcm # noqa: F401 from .math import diff # noqa: F401 from .math import angle # noqa: F401 @@ -414,6 +416,10 @@ 'triangular_solve', 'as_complex', 'as_real', + 'rad2deg', + 'deg2rad', + 'gcd', + 'lcm', 'diff', 'lerp', 'lerp_', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index fdd81fa8c0f9eb..fefaecaf604a07 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -2624,9 +2624,9 @@ def lerp(x, y, weight, name=None): lerp(x, y, weight) = x + weight * (y - x). Args: - x (Tensor): An N-D Tensor, the data type is float32, float64. - y (Tensor): An N-D Tensor, the data type is float32, float64. - weight (float|Tensor): the weight for the interpolation formula. + x (Tensor): An N-D Tensor with starting points, the data type is float32, float64. + y (Tensor): An N-D Tensor with ending points, the data type is float32, float64. + weight (float|Tensor): The weight for the interpolation formula. When weight is Tensor, the data type is float32, float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
Returns: @@ -2788,6 +2788,139 @@ def deg2rad(x, name=None): type='scale', inputs={'X':out_cast}, outputs={'Out': out}, attrs={'scale': deg2rad_scale}) return out +def gcd(x, y, name=None): + """ + Computes the element-wise greatest common divisor (GCD) of input |x| and |y|. + Both x and y must have integer types. + + Note: + gcd(0,0)=0, gcd(0, y)=|y| + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.gcd(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.gcd(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20, 1 , 2 , 1 , 4 , 5]) + + x4 = paddle.to_tensor(0) + paddle.gcd(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [20]) + + paddle.gcd(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.gcd(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [4]) + """ + shape = paddle.broadcast_shape(x.shape, y.shape) + x = paddle.broadcast_to(x, shape) + y = paddle.broadcast_to(y, shape) + x = paddle.abs(x) + y = paddle.abs(y) + + def _gcd_cond_fn(x, y): + return paddle.any(y != 0) + + def _gcd_body_fn(x, y): + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. + y_not_equal_0 = (y != 0) + y_safe = paddle.where(y_not_equal_0, y, paddle.ones(y.shape, y.dtype)) + x, y = (paddle.where(y_not_equal_0, y, x), + paddle.where(y_not_equal_0, paddle.mod(x, y_safe),paddle.zeros(y.shape, y.dtype))) + return (paddle.where(x < y, y, x), paddle.where(x < y, x, y)) + + if in_dygraph_mode(): + while _gcd_cond_fn(x, y): + x, y = _gcd_body_fn(x, y) + + return x + else: + check_variable_and_dtype(x, 'x', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + check_variable_and_dtype(y, 'y', ['int32', 'int64', 'int8', 'int16', 'uint8'], 'gcd') + out, _ = paddle.static.nn.while_loop(_gcd_cond_fn, _gcd_body_fn, [x, y]) + return out + +def lcm(x, y, name=None): + """ + Computes the element-wise least common multiple (LCM) of input |x| and |y|. + Both x and y must have integer types. + + Note: + lcm(0,0)=0, lcm(0, y)=0 + + Args: + x, y (Tensor): An N-D Tensor, the data type is int8,int16,int32,int64,uint8. + If x.shape != y.shape, they must be broadcastable to a common shape (which becomes the shape of the output). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): An N-D Tensor, the data type is the same with input. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + x1 = paddle.to_tensor(12) + x2 = paddle.to_tensor(20) + paddle.lcm(x1, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + + x3 = paddle.to_tensor(np.arange(6)) + paddle.lcm(x3, x2) + # Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0, 20, 20, 60, 20, 20]) + + x4 = paddle.to_tensor(0) + paddle.lcm(x4, x2) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + paddle.lcm(x4, x4) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [0]) + + x5 = paddle.to_tensor(-20) + paddle.lcm(x1, x5) + # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [60]) + """ + d = paddle.gcd(x, y) + # paddle.mod will raise an error when any element of y is 0. To avoid + # that, we change those zeros to ones. Their values don't matter because + # they won't be used. + d_equal_0 = paddle.equal(d, 0) + d_safe = paddle.where(d_equal_0, paddle.ones(d.shape, d.dtype), d) + out = paddle.where(d_equal_0, paddle.zeros(d.shape, d.dtype), paddle.abs(x * y) // d_safe) + return out + def diff(x, n=1, axis=-1, prepend=None, append=None, name=None): r""" Computes the n-th forward difference along the given axis. diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 5506ee95bd7c9e..ed3bb1dc5f1f01 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -345,6 +345,7 @@ def source_include(header_file_path): #include "glog/logging.h" #include "paddle/pten/api/lib/api_registry.h" +#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/kernel_registry.h" @@ -353,22 +354,6 @@ def source_include(header_file_path): """ -def module_declare(): - return """ -PT_DECLARE_MODULE(CreationCPU); -PT_DECLARE_MODULE(LinalgCPU); -PT_DECLARE_MODULE(ManipulationCPU); -PT_DECLARE_MODULE(MathCPU); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_MODULE(CreationCUDA); -PT_DECLARE_MODULE(LinalgCUDA); -PT_DECLARE_MODULE(ManipulationCUDA); -PT_DECLARE_MODULE(MathCUDA); -#endif -""" - - def api_register(): return """ PT_REGISTER_API(Creation); @@ -405,7 +390,6 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): include_header_file = "paddle/pten/api/include/api.h" source_file.write(source_include(include_header_file)) - source_file.write(module_declare()) source_file.write(namespace[0]) for api in apis: diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 79a742c314bd07..46b415e24d8ca2 100644 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -202,7 +202,7 @@ 'test_fleet_runtime', 'test_rnn_cudnn_params_packing', 'test_mkldnn_placement_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'program_desc_test', 'test_simplify_with_basic_ops_pass', 'test_dygraph_mode_of_unittest', @@ -1417,7 +1417,7 @@ 'test_fc_mkldnn_op', 'test_fc_lstm_fuse_pass', 'test_fc_gru_fuse_pass', - 'test_fc_elementwise_layernorm_fuse_pass', + 'test_fc_elementwise_layernorm_fuse_pass_cc', 'test_fc_bf16_mkldnn_op', 'test_executor_feed_non_tensor', 'test_executor_check_feed',
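Closing note on the `paddle.tensor.math` additions above: `gcd` runs the Euclidean algorithm in vectorized form, masking finished lanes with `paddle.where`, so the same loop body serves both eager execution in dygraph mode and `paddle.static.nn.while_loop` in static graphs; `lcm` then reuses it via |x * y| // gcd(x, y) with the same guard against a zero divisor. A simplified NumPy re-statement of both (it omits the final ordering swap the Paddle body performs; illustrative, not the shipped implementation):

    import numpy as np

    def gcd_ref(x, y):
        # Vectorized Euclid: each step maps (x, y) -> (y, x mod y) on lanes
        # where y != 0 and freezes lanes that have already finished.
        x, y = np.broadcast_arrays(np.abs(x), np.abs(y))
        x, y = x.copy(), y.copy()
        while np.any(y != 0):
            live = y != 0
            y_safe = np.where(live, y, 1)  # dodge mod-by-zero; value unused
            x, y = np.where(live, y, x), np.where(live, x % y_safe, 0)
        return x

    def lcm_ref(x, y):
        d = gcd_ref(x, y)
        d_safe = np.where(d == 0, 1, d)    # lcm(0, 0) is defined as 0
        return np.where(d == 0, 0,
                        np.abs(np.asarray(x) * np.asarray(y)) // d_safe)

    assert (gcd_ref(np.arange(6), 20) == [20, 1, 2, 1, 4, 5]).all()
    assert (lcm_ref(np.arange(6), 20) == [0, 20, 20, 60, 20, 20]).all()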