Commit 25b621a

Merge branch 'develop' into xpu_pass

leolishaohao committed Sep 4, 2023
2 parents: 2c0f5cf + c089a2a
Showing 295 changed files with 7,134 additions and 3,370 deletions.
6 changes: 3 additions & 3 deletions .clang-tidy
@@ -12,7 +12,7 @@ bugprone-exception-escape,
 -bugprone-fold-init-type,
 -bugprone-forwarding-reference-overload,
 -bugprone-inaccurate-erase,
--bugprone-incorrect-roundings,
+bugprone-incorrect-roundings,
 -bugprone-infinite-loop,
 bugprone-integer-division,
 -bugprone-macro-repeated-side-effects,
@@ -186,7 +186,7 @@ modernize-use-equals-default,
 -modernize-use-noexcept,
 modernize-use-nullptr,
 modernize-use-override,
--modernize-use-transparent-functors,
+modernize-use-transparent-functors,
 -modernize-use-uncaught-exceptions,
 performance-faster-string-find,
 -performance-for-range-copy,
@@ -197,7 +197,7 @@ performance-inefficient-string-concatenation,
 -performance-move-const-arg,
 -performance-move-constructor-init,
 -performance-no-automatic-move,
--performance-noexcept-move-constructor,
+performance-noexcept-move-constructor,
 -performance-trivially-destructible,
 -performance-type-promotion-in-math-fn,
 -performance-unnecessary-copy-initialization,
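In a .clang-tidy Checks list, a leading '-' disables a check, so this hunk turns on three checks that were previously off: bugprone-incorrect-roundings, modernize-use-transparent-functors, and performance-noexcept-move-constructor. A small illustrative file (not from the Paddle tree) showing the kind of code each newly enabled check flags:

    // clang-tidy examples — illustrative only.
    #include <cmath>
    #include <functional>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    // bugprone-incorrect-roundings: (int)(x + 0.5) truncates, so negative
    // values round the wrong way; std::round is the suggested fix.
    int bad_round(double x) { return static_cast<int>(x + 0.5); }   // flagged
    int good_round(double x) { return static_cast<int>(std::round(x)); }

    // modernize-use-transparent-functors: prefer the transparent std::less<>
    // (which also enables heterogeneous lookup) over std::less<std::string>.
    std::set<std::string, std::less<std::string>> flagged_set;       // flagged
    std::set<std::string, std::less<>> preferred_set;

    // performance-noexcept-move-constructor: a move constructor that is not
    // noexcept makes std::vector copy instead of move when it reallocates.
    struct Buffer {
      std::vector<int> data;
      Buffer() = default;
      Buffer(Buffer&& other) noexcept : data(std::move(other.data)) {}
    };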
2 changes: 1 addition & 1 deletion .gitignore
@@ -72,7 +72,7 @@ tools/nvcc_lazy
 
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
-paddle/fluid/pybind/eager_op_function.cc
+paddle/fluid/pybind/eager_op_function.*
 tools/nvcc_lazy
 paddle/phi/kernels/sparse/gpu/cutlass_generator/all_gemm_operations.h
 paddle/phi/kernels/sparse/gpu/cutlass_generator/configurations.h
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -605,6 +605,7 @@ if(WITH_CINN)
   add_definitions(-DPADDLE_WITH_CINN)
 
   if(CINN_ONLY)
+    add_definitions(-DCINN_WITH_ONLY)
     if(WITH_PYTHON)
       add_subdirectory(python)
     endif()
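CINN_ONLY builds now also define CINN_WITH_ONLY for the C++ code. The code that tests the macro is elsewhere in the tree and not shown in this diff; a hypothetical guard of the usual form:

    // Hypothetical use of the new definition — the real guarded code is not
    // part of this diff.
    #ifdef CINN_WITH_ONLY
    constexpr bool kCinnStandaloneBuild = true;   // built with CINN_ONLY=ON
    #else
    constexpr bool kCinnStandaloneBuild = false;  // CINN integrated with Paddle
    #endif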
8 changes: 4 additions & 4 deletions cmake/cinn.cmake
@@ -168,8 +168,8 @@ cinn_cc_library(
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
 if(NOT CINN_ONLY)
-  target_link_libraries(cinnapi phi)
-  add_dependencies(cinnapi phi)
+  target_link_libraries(cinnapi pd_dialect phi)
+  add_dependencies(cinnapi pd_dialect phi)
 endif()
 
 target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
@@ -226,8 +226,8 @@ function(gen_cinncore LINKTYPE)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
   if(NOT CINN_ONLY)
-    target_link_libraries(${CINNCORE_TARGET} phi)
-    add_dependencies(${CINNCORE_TARGET} phi)
+    target_link_libraries(${CINNCORE_TARGET} pd_dialect phi)
+    add_dependencies(${CINNCORE_TARGET} pd_dialect phi)
   endif()
 
   add_dependencies(${CINNCORE_TARGET} pybind)
14 changes: 5 additions & 9 deletions cmake/external/openblas.cmake
@@ -20,16 +20,12 @@ set(CBLAS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/openblas)
 set(CBLAS_TAG v0.3.7)
 set(CMAKE_VERBOSE_MAKEFILE 1)
 
-# Why use v0.3.18? The IDG business line encountered a random openblas error,
-# which can be resolved after upgrading openblas.
-# And why compile when gcc>8.2? Please refer to
-# https://github.com/spack/spack/issues/19932#issuecomment-733452619
-# v0.3.18 only support gcc>=8.3 or gcc>=7.4
-if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-   AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.2
+# OpenBLAS support Raptor Lake from v0.3.22
+if(UNIX
+   AND NOT APPLE
+   AND NOT WITH_ROCM
    AND NOT WITH_XPU)
-  # We only compile with openblas 0.3.18 when gcc >= 8.3
-  set(CBLAS_TAG v0.3.18)
+  set(CBLAS_TAG v0.3.23)
 endif()
 
 if(APPLE AND WITH_ARM)
5 changes: 5 additions & 0 deletions cmake/flags.cmake
@@ -141,6 +141,11 @@ if(NOT WIN32)
   set(COMMON_FLAGS
       -fPIC
       -fno-omit-frame-pointer
+      -pipe
+      -ffunction-sections
+      -fdata-sections
+      -Wl
+      -gc-sections
       -Werror
       -Wall
       -Wextra
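-pipe only speeds up compilation (temporaries go through pipes instead of files); -ffunction-sections and -fdata-sections place every function and data object in its own section so that section-level garbage collection at link time can drop whatever is unreferenced. Note the linker request appears here as the two separate list items -Wl and -gc-sections; when invoking the compiler driver directly it is conventionally spelled as the single argument -Wl,--gc-sections. A minimal illustration (not from the tree) of what the combination buys:

    // gc_sections_demo.cc — illustrative only. Built as:
    //   g++ -ffunction-sections -fdata-sections -Wl,--gc-sections gc_sections_demo.cc
    // never_called() sits in its own unreferenced section, so the linker
    // discards it and the binary shrinks.
    int used() { return 42; }
    int never_called() { return 7; }  // garbage-collected at link time
    int main() { return used(); }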
4 changes: 2 additions & 2 deletions paddle/cinn/auto_schedule/auto_tuner.cc
@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
   const auto& shape_dict = graph_->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
 
-  op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target_);
+  op_lowerer_ = std::make_unique<hlir::framework::OpLowerer<GroupPtr>>(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target_));
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto i = 0; i < tasks_.size(); ++i) {
     auto&& task = tasks_[i];
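Most of the C++ changes in this commit follow from one refactor: OpLowerer is now a class template over the group type, constructed from a heap-allocated OpLowererImpl. A compilable sketch of the shape this implies — only the names OpLowerer<GroupPtr>, OpLowererImpl, and CreateOpLowerer come from the diff; the base class, signatures, and stubs below are assumptions:

    #include <memory>
    #include <vector>

    struct LoweredFunc {};  // stub standing in for ir::LoweredFunc
    struct Group {};        // stub standing in for hlir::framework::Graph::Group
    using GroupPtr = std::shared_ptr<Group>;

    template <typename T>
    class OpLowererImplBase {
     public:
      virtual ~OpLowererImplBase() = default;
      virtual std::vector<LoweredFunc> Lower(const T& group,
                                             bool apply_op_schedule,
                                             bool apply_group_schedule) = 0;
    };

    // The wrapper owns a concrete lowering strategy and forwards to it, which
    // is why call sites now pass `new OpLowererImpl(...)` to the constructor.
    template <typename T>
    class OpLowerer {
     public:
      explicit OpLowerer(OpLowererImplBase<T>* impl) : impl_(impl) {}
      std::vector<LoweredFunc> Lower(const T& group,
                                     bool apply_op_schedule = true,
                                     bool apply_group_schedule = true) {
        return impl_->Lower(group, apply_op_schedule, apply_group_schedule);
      }

     private:
      std::shared_ptr<OpLowererImplBase<T>> impl_;
    };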
4 changes: 2 additions & 2 deletions paddle/cinn/auto_schedule/auto_tuner.h
@@ -30,11 +30,11 @@
 
 namespace cinn {
 namespace auto_schedule {
-
 // This class is entrance of auto-tune, users can use it
 // to tune graph (not supported yet) and search a series of schedules
 // that maybe more likely to obtain better performance.
 // Internally, it creates necessary components and use them to perform tuning.
+using GroupPtr = hlir::framework::GroupPtr;
 class AutoTuner {
  public:
   // configure how to perform auto-tune, such as
@@ -58,7 +58,7 @@ class AutoTuner {
  private:
   const common::Target& target_;
   hlir::framework::Graph* graph_;
-  std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
+  std::unique_ptr<hlir::framework::OpLowerer<GroupPtr>> op_lowerer_;
 
   // Tasks to tune
   std::vector<TuneTask> tasks_;
7 changes: 4 additions & 3 deletions paddle/cinn/auto_schedule/measure/measurer_test.cc
@@ -26,6 +26,7 @@
 #include "paddle/cinn/frontend/syntax.h"
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/hlir/framework/graph_compiler_util.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/runtime/flags.h"
 
 namespace cinn {
@@ -75,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
         absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
             "infershape");
 
-    auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-        dtype_dict, shape_dict, target);
+    auto op_lowerer =
+        hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
     inputs.reserve(tasks.size());
     for (int i = 0; i < tasks.size(); ++i) {
       auto* task = &tasks[i];
-      task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
+      task->Initialize(shape_dict, dtype_dict, &op_lowerer);
       MeasureInput input;
       input.task = task;
       input.lowered_funcs = task->lowered_funcs;
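The test updates all trade std::make_unique<OpLowerer>(...) plus op_lowerer.get() for a CreateOpLowerer factory used by value (op_lowerer.Lower, &op_lowerer), which suggests the factory returns the wrapper directly. A hypothetical definition consistent with that usage, reusing the stub types from the sketch above (the real factory takes the dtype/shape dictionaries and a Target):

    class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
     public:
      std::vector<LoweredFunc> Lower(const GroupPtr& group,
                                     bool apply_op_schedule,
                                     bool apply_group_schedule) override {
        return {};  // real lowering elided
      }
    };

    OpLowerer<GroupPtr> CreateOpLowerer() {  // parameters elided in this sketch
      return OpLowerer<GroupPtr>(new OpLowererImpl());
    }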
@@ -161,14 +161,14 @@ TEST(AutoInline, AddReluInline) {
           "inferdtype");
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
 
   EXPECT_EQ(graph->fusion_groups.size(), 1UL);
   std::vector<ir::LoweredFunc> funcs =
-      op_lowerer->Lower(graph->fusion_groups[0],
-                        /*apply_op_schedule = */ false,
-                        /*apply_group_schedule=*/false);
+      op_lowerer.Lower(graph->fusion_groups[0],
+                       /*apply_op_schedule = */ false,
+                       /*apply_group_schedule=*/false);
 
   VLOG(6) << "Expr before auto inline: " << funcs[0]->body;
 
@@ -61,7 +61,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule(
           "inferdtype");
   auto& shape_dict = graph->GetMutableAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target_);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
 
   lowered_funcs_ =
       op_lowerer.Lower(graph->fusion_groups.front(),
@@ -27,6 +27,7 @@
 #include "paddle/cinn/auto_schedule/task/task_registry.h"
 #include "paddle/cinn/auto_schedule/task/tune_task.h"
 #include "paddle/cinn/auto_schedule/tuning.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "test/cpp/cinn/program_builder.h"
@@ -44,11 +45,11 @@ std::vector<TuneTask> CreateTasks(const frontend::Program& program,
           "inferdtype");
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto i = 0; i < tasks.size(); ++i) {
-    tasks[i].Initialize(shape_dict, dtype_dict, op_lowerer.get());
+    tasks[i].Initialize(shape_dict, dtype_dict, &op_lowerer);
     task_registry->Regist(tasks[i].serialized_key,
                           ir::ModuleExpr(tasks[i].GetLoweredFuncBodyExprs()));
   }
7 changes: 3 additions & 4 deletions paddle/cinn/auto_schedule/task/task_registry_test.cc
@@ -45,11 +45,10 @@ std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
 
-  std::unique_ptr<hlir::framework::OpLowerer> op_lowerer =
-      std::make_unique<hlir::framework::OpLowerer>(
-          dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
   for (TuneTask& task : tasks) {
-    task.Initialize(shape_dict, dtype_dict, op_lowerer.get());
+    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
     VLOG(3) << "Add a task with serialized_key:\n" << task.serialized_key;
   }
 
2 changes: 1 addition & 1 deletion paddle/cinn/auto_schedule/task/tune_task.cc
@@ -34,7 +34,7 @@ void TuneTask::Initialize(
     const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
         shape_dict,
     const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
-    hlir::framework::OpLowerer* lower_handler) {
+    hlir::framework::OpLowerer<GroupPtr>* lower_handler) {
   CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
   op_lowerer = lower_handler;
 
9 changes: 5 additions & 4 deletions paddle/cinn/auto_schedule/task/tune_task.h
@@ -34,24 +34,25 @@ namespace cinn {
 namespace auto_schedule {
 
 class TuneTask {
+  using GroupPtr = hlir::framework::GroupPtr;
+
  public:
   TuneTask() = default;
-  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
-      : subgraph(group) {}
+  explicit TuneTask(GroupPtr group) : subgraph(group) {}
   // Initialize a task
   void Initialize(
       const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
           shape_dict,
       const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
-      hlir::framework::OpLowerer* lower_handler);
+      hlir::framework::OpLowerer<GroupPtr>* lower_handler);
   // Extract bodies in lowered_funcs() and return
   std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
 
   // In CINN, we use hlir::framework::Graph::Group to represent a fused
   // sub-graph (if an op won't be fused, it will be a Group with size=1).
   std::shared_ptr<hlir::framework::Graph::Group> subgraph;
   // Lower handler, Not owned
-  hlir::framework::OpLowerer* op_lowerer;
+  hlir::framework::OpLowerer<GroupPtr>* op_lowerer;
   // target of this task
   common::Target target;
   // stores the initial (un-optimized) LoweredFuncs
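TuneTask now spells its group type through the hlir::framework::GroupPtr alias while the subgraph member keeps the explicit std::shared_ptr type, so the two presumably coincide. An assumed definition, not shown in this diff:

    // Assumption: the alias used throughout this commit.
    using GroupPtr = std::shared_ptr<hlir::framework::Graph::Group>;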
9 changes: 6 additions & 3 deletions paddle/cinn/auto_schedule/task/tune_task_test.cc
@@ -75,7 +75,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
   const auto& dtype_dict =
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
           "inferdtype");
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
 
   std::stringstream ss;
   for (TuneTask& task : tasks) {
@@ -187,7 +188,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
           "inferdtype");
 
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  OpLowerer op_lowerer(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
 
   std::stringstream ss;
   for (TuneTask& task : tasks) {
@@ -291,7 +293,8 @@ TEST(TuneTask, SerializeToString) {
   const auto& dtype_dict =
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  OpLowerer op_lowerer(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
   ASSERT_EQ(single_tasks.size(), 2UL);
   for (auto&& task : single_tasks) {
     task.Initialize(shape_dict, dtype_dict, &op_lowerer);
12 changes: 6 additions & 6 deletions paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -27,6 +27,7 @@
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/hlir/framework/graph_compiler_util.h"
 #include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/runtime/flags.h"
@@ -143,9 +144,8 @@ class PerformanceTester : public ::testing::Test {
         absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
             "infershape");
 
-    std::shared_ptr<hlir::framework::OpLowerer> op_lowerer =
-        std::make_unique<hlir::framework::OpLowerer>(
-            dtype_dict, shape_dict, target_);
+    auto op_lowerer =
+        hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
 
     CompilationContext& context = graph_compiler->GetCompilationContext();
     context.with_instantiate_variables = true;
@@ -157,9 +157,9 @@
 
     for (auto group : graph->fusion_groups) {
      context.lowered_funcs.push_back(
-          op_lowerer->Lower(group,
-                            /*apply_op_schedule = */ false,
-                            /*apply_group_schedule=*/false));
+          op_lowerer.Lower(group,
+                           /*apply_op_schedule = */ false,
+                           /*apply_group_schedule=*/false));
     }
 
     VLOG(3) << "===========================No Schedule LoweredFunc "
1 change: 1 addition & 0 deletions paddle/cinn/backends/compiler.cc
@@ -19,6 +19,7 @@
 #include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
 #include "paddle/cinn/common/context.h"
 #include "paddle/cinn/hlir/framework/visualize_helper.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"