Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into dyn_cast_interface
zhangbopd committed Sep 26, 2023
2 parents 307b372 + 7282acf commit cb8456a
Showing 1,184 changed files with 32,060 additions and 16,110 deletions.
4 changes: 2 additions & 2 deletions .clang-tidy
@@ -4,7 +4,7 @@ bugprone-argument-comment,
-bugprone-assert-side-effect,
-bugprone-bad-signal-to-kill-thread,
-bugprone-bool-pointer-implicit-conversion,
--bugprone-branch-clone,
+bugprone-branch-clone,
bugprone-copy-constructor-init,
-bugprone-dangling-handle,
-bugprone-dynamic-static-initializers,
@@ -75,7 +75,7 @@ clang-analyzer-cplusplus.InnerPointer,
-clang-analyzer-cplusplus.SelfAssignment,
-clang-analyzer-cplusplus.SmartPtr,
-clang-analyzer-cplusplus.VirtualCallModeling,
--clang-analyzer-deadcode.DeadStores,
+clang-analyzer-deadcode.DeadStores,
-clang-analyzer-fuchsia.HandleChecker,
-clang-analyzer-nullability.NullPassedToNonnull,
-clang-analyzer-nullability.NullReturnedFromNonnull,
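For context, the two hunks above enable checks that were previously switched off. The snippets below are illustrative only (hypothetical code, not from the Paddle tree) and show the patterns each newly enabled check diagnoses:

// bugprone-branch-clone: both branches are token-for-token identical.
int Sign(int x) {
  if (x >= 0) {
    return 1;
  } else {
    return 1;  // clone of the then-branch; likely meant: return -1;
  }
}

// clang-analyzer-deadcode.DeadStores: a stored value is never read.
int DeadStore() {
  int v = 42;  // dead store: overwritten before any read
  v = 7;
  return v;
}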
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -94,9 +94,9 @@ repos:
description: Check C++ code style using cpplint.py.
entry: bash ./tools/codestyle/cpplint_pre_commit.hook
language: system
-files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$
+files: \.(cc|cxx|cpp|cu|h|hpp|hxx)$
args:
-- --extensions=c,cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps
+- --extensions=cc,cxx,cpp,cu,cuh,h,hpp,hxx,kps
- --filter=-readability/fn_size,-build/include_what_you_use,-build/c++11,-whitespace/parens
- --quiet
# Exclude third-party libraries
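Net effect of the hunk above: plain C sources are dropped from the cpplint pre-commit hook. Files ending in .c no longer match the files pattern, and c is removed from --extensions, so the C++ style rules now apply only to C++/CUDA sources and headers.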
2 changes: 1 addition & 1 deletion cmake/external/gloo.cmake
@@ -37,7 +37,7 @@ if(WITH_GPU)
file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/gloo/device.cc.patch
native_dst)
set(GLOO_PATCH_COMMAND
-git checkout -- . && git checkout ${GLOO_TAG} &&patch -Nd
+git checkout -- . && git checkout ${GLOO_TAG} && patch -Nd
${SOURCE_DIR}/gloo/transport/tcp < ${native_dst})
endif()
endif()
8 changes: 6 additions & 2 deletions cmake/hip.cmake
@@ -85,8 +85,11 @@ find_package_and_include(rocsparse)
find_package_and_include(rocfft)

# set CXX flags for HIP
-set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+set(CMAKE_C_FLAGS
+    "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__ -DROCM_NO_WRAPPER_HEADER_WARNING")
+set(CMAKE_CXX_FLAGS
+    "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__ -DROCM_NO_WRAPPER_HEADER_WARNING"
+)
set(CMAKE_CXX_FLAGS
"${CMAKE_CXX_FLAGS} -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP")
set(THRUST_DEVICE_SYSTEM THRUST_DEVICE_SYSTEM_HIP)
@@ -96,6 +99,7 @@ list(APPEND HIP_CXX_FLAGS -fPIC)
list(APPEND HIP_CXX_FLAGS -D__HIP_PLATFORM_HCC__=1)
# Note(qili93): HIP has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer
list(APPEND HIP_CXX_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1)
+list(APPEND HIP_CXX_FLAGS -DROCM_NO_WRAPPER_HEADER_WARNING)
list(APPEND HIP_CXX_FLAGS -Wno-macro-redefined)
list(APPEND HIP_CXX_FLAGS -Wno-inconsistent-missing-override)
list(APPEND HIP_CXX_FLAGS -Wno-exceptions)
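Both the host C/C++ flags and HIP_CXX_FLAGS now define ROCM_NO_WRAPPER_HEADER_WARNING. Judging by the macro name (an assumption, not verified against the ROCm sources), this suppresses the deprecation warnings that ROCm's backward-compatibility wrapper headers emit during compilation.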
112 changes: 96 additions & 16 deletions paddle/cinn/ast_gen_ius/ast_gen.cc
@@ -18,34 +18,114 @@
#include "paddle/cinn/ir/operation.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/compute.h"

namespace cinn {
namespace ast_gen_ius {

-ir::Expr AstGen::Build(const ir::Tensor& tensor) {
+ir::Expr ConvertReduceBody(ir::Expr body,
+                           ir::Tensor tensor,
+                           const std::vector<Expr>& axis_exprs) {
+  ir::Reduce* reduce_node = body.As<ir::Reduce>();
+  if (!reduce_node) {
+    return ir::Store::Make(tensor, body, axis_exprs);
+  }
+
+  switch (reduce_node->reduce_type) {
+    case ir::Reduce::kSum:
+      return ir::Store::Make(
+          tensor, tensor(axis_exprs) + reduce_node->body, axis_exprs);
+    case ir::Reduce::kMul:
+      return ir::Store::Make(
+          tensor, tensor(axis_exprs) * reduce_node->body, axis_exprs);
+    case ir::Reduce::kMax:
+      return ir::Store::Make(
+          tensor,
+          ir::Max::Make(tensor(axis_exprs), reduce_node->body),
+          axis_exprs);
+    case ir::Reduce::kMin:
+      return ir::Store::Make(
+          tensor,
+          ir::Min::Make(tensor(axis_exprs), reduce_node->body),
+          axis_exprs);
+    case ir::Reduce::kAll:
+      return ir::Store::Make(
+          tensor, tensor(axis_exprs) && reduce_node->body, axis_exprs);
+    case ir::Reduce::kAny:
+      return ir::Store::Make(
+          tensor, tensor(axis_exprs) || reduce_node->body, axis_exprs);
+    default:
+      CINN_NOT_IMPLEMENTED
+  }
+}
+
+ir::Expr AstGen::Build(const ir::Tensor& tensor, TensorGroup* tensor_group) {
const std::vector<ir::Var>& axis = tensor->axis();
const std::vector<ir::Expr>& shape = tensor->shape;
size_t axis_len = axis.size();
-  CHECK_EQ(shape.size(), axis_len)
-      << "Internal Error: Tensor has different shape and axis length in AstGen";
+  CHECK_EQ(shape.size(), axis_len) << "Internal Error: Tensor has different "
+                                      "shape and axis length in AstGen";
std::vector<ir::Expr> axis_exprs;
for (const auto& a : axis) {
axis_exprs.push_back(a);
}
-  ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs);
-
-  for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
-    ir::Var loop_var = axis[i];
-    ir::Expr loop_extent = shape[i];
-    body = ir::For::Make(loop_var,
-                         Expr(0),
-                         loop_extent,
-                         ir::ForType::Serial,
-                         ir::DeviceAPI::Host,
-                         ir::Block::Make({body}));
-  }
-
+  if (tensor->is_reduce_tensor()) {
+    // Make an init Tensor for domain without reduce axis
+    Expr init_value = tensor->GetReduceInitVal();
+    // TODO(zhhsplendid): Clean the handcoded "__reduce_init" string
+    std::string reduce_init_name = tensor->name + "__reduce_init";
+    const std::vector<Expr>& domain = tensor->domain_without_reduce_axis();
+    ir::Tensor init_tensor = lang::Compute(
+        domain,
+        [=](const std::vector<Expr>& axis) { return init_value; },
+        reduce_init_name);
+    tensor_group->Insert(init_tensor);
+    tensor_group->MarkShareMemBuffer(tensor, init_tensor);
+    tensor_group->CtrlDepend(tensor, init_tensor);
+    Expr init_body = ir::Store::Make(init_tensor, init_value, axis_exprs);
+
+    // For the remaining reduce axis, make reduce body
+    const std::vector<ir::Var>& reduce_axis = tensor->reduce_axis;
+    ir::Expr reduce_body =
+        ConvertReduceBody(tensor->body(), tensor, axis_exprs);
+    for (int i = static_cast<int>(reduce_axis.size()) - 1; i >= 0; --i) {
+      reduce_body = ir::For::Make(reduce_axis[i],
+                                  reduce_axis[i]->lower_bound,
+                                  reduce_axis[i]->upper_bound,
+                                  ir::ForType::Serial,
+                                  ir::DeviceAPI::Host,
+                                  ir::Block::Make({reduce_body}));
+    }
+
+    // Put the two parts together
+    ir::Expr body = ir::Block::Make({init_body, reduce_body});
+    for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
+      ir::Var loop_var = axis[i];
+      ir::Expr loop_extent = shape[i];
+      body = ir::For::Make(
+          loop_var,
+          Expr(0),
+          loop_extent,
+          ir::ForType::Serial,
+          ir::DeviceAPI::Host,
+          i == static_cast<int>(axis_len) - 1 ? body : ir::Block::Make({body}));
+    }
+    return body;
+  } else {
+    ir::Expr body = ir::Store::Make(tensor, tensor->body(), axis_exprs);
+    for (int i = static_cast<int>(axis_len) - 1; i >= 0; --i) {
+      ir::Var loop_var = axis[i];
+      ir::Expr loop_extent = shape[i];
+      body = ir::For::Make(loop_var,
+                           Expr(0),
+                           loop_extent,
+                           ir::ForType::Serial,
+                           ir::DeviceAPI::Host,
+                           ir::Block::Make({body}));
+    }
+    return body;
+  }
-  return body;
}

} // namespace ast_gen_ius
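Taken together: ConvertReduceBody rewrites a tensor's ir::Reduce body into an accumulating ir::Store, and the new reduce branch of AstGen::Build wraps it in an init store over the non-reduce domain plus one serial loop per reduce axis. As a rough sketch, a sum reduction B[i] = sum_k A[i][k] would lower approximately to the pseudocode below (illustrative names and bounds, not actual generated IR):

// B__reduce_init shares B's buffer (MarkShareMemBuffer), so the init
// store zeroes B[i] before the accumulation runs.
for (int i = 0; i < I; ++i) {      // loop over the non-reduce axis
  B__reduce_init[i] = 0;           // init_body, value from GetReduceInitVal()
  for (int k = 0; k < K; ++k) {    // bounds from reduce_axis lower/upper_bound
    B[i] = B[i] + A[i][k];         // kSum case of ConvertReduceBody
  }
}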
2 changes: 1 addition & 1 deletion paddle/cinn/ast_gen_ius/ast_gen.h
@@ -23,7 +23,7 @@ namespace ast_gen_ius {

class AstGen {
public:
-  static ir::Expr Build(const ir::Tensor& tensor);
+  static ir::Expr Build(const ir::Tensor& tensor, TensorGroup* tensor_group);
};

} // namespace ast_gen_ius
4 changes: 3 additions & 1 deletion paddle/cinn/ast_gen_ius/ast_gen_test.cc
@@ -16,6 +16,7 @@
#include <vector>

#include "paddle/cinn/ast_gen_ius/ast_gen.h"
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/tensor.h"
@@ -36,7 +37,8 @@ TEST(AstGen, Build) {
shape,
[&](const std::vector<Expr>& indice) { return lang::Relu(A(indice), 0); },
"relu_test");
-  Expr out = AstGen::Build(B);
+  TensorGroup tensor_group({B});
+  Expr out = AstGen::Build(B, &tensor_group);
LOG(INFO) << out;
}

29 changes: 11 additions & 18 deletions paddle/cinn/ast_gen_ius/tensor_group.cc
@@ -30,7 +30,7 @@ TensorGroup::TensorGroup(const std::vector<ir::Tensor>& tensors) {

for (auto& tensor : tensors) {
output_tensor_names_.insert(tensor->name);
-    std::set<ir::Expr> used_tensors = ir::CollectIRNodes(
+    std::set<ir::Expr> used_tensors = ir::ir_utils::CollectIRNodes(
tensor->body(), [](const Expr* x) { return x->as_tensor(); });
for (const Expr& x : used_tensors) {
const ir::Tensor to_dep = x.as_tensor_ref();
@@ -75,10 +75,12 @@ std::vector<ir::Tensor> TensorGroup::GetGenFuncTopoOrder(
}

std::vector<ir::Tensor> ret;
-  std::vector<std::string> stack;

+  // Use a set instead of a vector/stack to get a fixed alphabetical topo order
+  std::set<std::string> node_set;
for (const auto& name_tensor : name_to_tensor_) {
if (!in_degree.count(name_tensor.first)) {
-      stack.emplace_back(name_tensor.first);
+      node_set.insert(name_tensor.first);
}
}

@@ -90,9 +92,9 @@
input_arg_names.erase(name);
}

-  while (!stack.empty()) {
-    const std::string& cur = stack.back();
-    stack.pop_back();
+  while (!node_set.empty()) {
+    const std::string cur = *(node_set.begin());
+    node_set.erase(node_set.begin());

if (!input_arg_names.count(cur)) {
ret.push_back(name_to_tensor_[cur]);
@@ -103,23 +105,14 @@
if (dep_tensor_names.count(cur)) {
--in_degree[dep_pair.first];
if (in_degree[dep_pair.first] == 0) {
-          stack.emplace_back(dep_pair.first);
+          node_set.insert(dep_pair.first);
}
}
}
}
return ret;
}

-bool TensorGroup::HasMarkedReduceInit(const std::string& tensor_name) const {
-  return tensor_name_needs_reduce_init_.count(tensor_name);
-}
-
-ir::Tensor TensorGroup::MarkReduceInit(const std::string& tensor_name) {
-  // TODO(zhhsplendid): add check
-  tensor_name_needs_reduce_init_.insert(tensor_name);
-}

void TensorGroup::CtrlDepend(const ir::Tensor& tensor,
const ir::Tensor& to_dep) {
ctrl_dep_[tensor->name].insert(to_dep->name);
@@ -156,8 +149,8 @@ std::string TensorGroup::GetShareMemRootName(const std::string& tensor_name) {
return share_memory_tensor_[tensor_name];
}

-void TensorGroup::ShareMemoryBuffer(const ir::Tensor& tensor,
-                                    const ir::Tensor& to_share) {
+void TensorGroup::MarkShareMemBuffer(const ir::Tensor& tensor,
+                                     const ir::Tensor& to_share) {
share_memory_tensor_[GetShareMemRootName(to_share->name)] =
GetShareMemRootName(tensor->name);
}
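The switch from a vector used as a stack to a std::set in GetGenFuncTopoOrder makes the emitted topological order deterministic: erasing *begin() always picks the alphabetically smallest ready node. A minimal self-contained sketch of the same idea (assumed names, not Paddle's API; deps maps each node to the set of nodes it depends on):

#include <map>
#include <set>
#include <string>
#include <vector>

std::vector<std::string> TopoOrder(
    const std::map<std::string, std::set<std::string>>& deps) {
  // In-degree = number of unresolved dependencies per node.
  std::map<std::string, int> in_degree;
  for (const auto& [node, ins] : deps) {
    in_degree.emplace(node, 0);
    for (const auto& in : ins) in_degree.emplace(in, 0);
  }
  for (const auto& [node, ins] : deps) {
    in_degree[node] += static_cast<int>(ins.size());
  }

  // Ordered frontier: ties are broken alphabetically, not by push order.
  std::set<std::string> ready;
  for (const auto& [node, deg] : in_degree) {
    if (deg == 0) ready.insert(node);
  }

  std::vector<std::string> order;
  while (!ready.empty()) {
    const std::string cur = *ready.begin();
    ready.erase(ready.begin());
    order.push_back(cur);
    // Release every node that was waiting on cur.
    for (const auto& [node, ins] : deps) {
      if (ins.count(cur) && --in_degree[node] == 0) {
        ready.insert(node);
      }
    }
  }
  return order;
}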
(Only the first few of the 1,184 changed files are shown above.)
