Merge branch 'develop' into remove_mkldnn_code

PaddlePaddle · Dec 14, 2022 · 8603d90 · 8603d90
2 parents 1006383 + b920705
commit 8603d90
Show file tree

Hide file tree

Showing 533 changed files with 9,828 additions and 12,976 deletions.
diff --git a/.github/ISSUE_TEMPLATE/3_build-installation-issue.yml b/.github/ISSUE_TEMPLATE/3_build-installation-issue.yml
@@ -49,11 +49,16 @@ body:
       Paddle With CUDA:
 
       OS:
+      GCC version:
+      Clang version:
+      CMake version:
+      Libc version:
       Python version:
 
       CUDA version:
       cuDNN version:
       Nvidia driver version:
+      Nvidia driver List:
       ****************************************
   validations:
     required: true

diff --git a/.gitignore b/.gitignore
@@ -55,6 +55,9 @@ Testing
 tools/__pycache__
 tools/nvcc_lazy
 
+# Ignore files generated from 'python setup.py develop'
+@PADDLE_BINARY_DIR@
+
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
 paddle/infrt/dialect/pd/ir/pd_ops.td
@@ -73,8 +76,8 @@ tools/nvcc_lazy
 # these files (directories) are generated before build system generation
 paddle/fluid/operators/generated_op.cc
 paddle/fluid/operators/generated_sparse_op.cc
-paddle/phi/ops/compat/generated_sig.cc
-paddle/phi/ops/compat/generated_sparse_sig.cc
+paddle/fluid/operators/generated_static_op.cc
+paddle/phi/ops/compat/generated_*.cc
 paddle/phi/api/yaml/parsed_apis/
 paddle/fluid/operators/generator/parsed_ops/
 paddle/fluid/pybind/tmp_eager_op_function_impl.h

diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
@@ -29,11 +29,13 @@ list(
   ${CUDNN_ROOT}
   ${CUDNN_ROOT}/lib64
   ${CUDNN_ROOT}/lib
+  ${CUDNN_ROOT}/lib/x64
   ${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
   ${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
   $ENV{CUDNN_ROOT}
   $ENV{CUDNN_ROOT}/lib64
   $ENV{CUDNN_ROOT}/lib
+  $ENV{CUDNN_ROOT}/lib/x64
   /usr/lib
   ${CUDA_TOOLKIT_ROOT_DIR}
   ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
@@ -220,7 +220,8 @@ if(APPLE)
       -Werror=uninitialized
       -Werror=tautological-constant-out-of-range-compare
       -Werror=literal-conversion
-      -Werror=pragma-pack)
+      -Werror=pragma-pack
+      -Werror=c++17-extensions)
 endif()
 
 if(WITH_HETERPS AND WITH_PSLIB)

diff --git a/cmake/operators.cmake b/cmake/operators.cmake
@@ -26,6 +26,35 @@ function(find_register FILENAME PATTERN OUTPUT)
       PARENT_SCOPE)
 endfunction()
 
+function(find_phi_register FILENAME ADD_PATH)
+  # Scan FILENAME for its first PD_REGISTER_KERNEL(...) and append the matching PD_DECLARE_KERNEL(name, backend, layout) line to ADD_PATH; nothing is returned to the caller.
+  set(options "") # NOTE(review): options/oneValueArgs/multiValueArgs are never read in this function
+  set(oneValueArgs "")
+  set(multiValueArgs "")
+  file(READ ${FILENAME} CONTENT) # whole file text is loaded into CONTENT
+
+  string(
+    REGEX
+      MATCH # REGEX MATCH keeps a single match, so only one registration per file is picked up
+      "PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
+      register # receives the matched text, or an empty string when nothing matched
+      "${CONTENT}")
+  if(NOT register STREQUAL "") # skip files without a PD_REGISTER_KERNEL call
+    string(REPLACE "PD_REGISTER_KERNEL(" "" register "${register}") # drop the macro name and opening parenthesis
+    string(REPLACE "," ";" register "${register}") # commas become list separators
+    string(REGEX REPLACE "[ \\\t\r\n]+" "" register "${register}") # remove spaces, tabs, newlines and backslashes
+    string(REGEX REPLACE "//cuda_only" "" register "${register}") # remove a literal "//cuda_only" marker left in the matched text
+    list(GET register 0 kernel_name) # 1st macro argument
+    list(GET register 1 kernel_backend) # 2nd macro argument
+    list(GET register 2 kernel_layout) # 3rd macro argument
+
+    file(
+      APPEND ${ADD_PATH}
+      "PD_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ${kernel_layout});\n"
+    )
+  endif()
+endfunction()
+
 function(op_library TARGET)
   # op_library is a function to create op library. The interface is same as
   # cc_library. But it handle split GPU/CPU code and link some common library
@@ -371,6 +400,8 @@ function(op_library TARGET)
   foreach(cc_src ${cc_srcs})
     # pybind USE_OP_ITSELF
     set(op_name "")
+    # Add PHI Kernel Registry Message
+    find_phi_register(${cc_src} ${pybind_file})
     find_register(${cc_src} "REGISTER_OPERATOR" op_name)
     if(NOT ${op_name} EQUAL "")
       file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
@@ -408,6 +439,8 @@ function(op_library TARGET)
   # message("cu_srcs ${cu_srcs}")
   foreach(cu_src ${cu_srcs})
     set(op_name "")
+    # Add PHI Kernel Registry Message
+    find_phi_register(${cu_src} ${pybind_file})
     find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
     if(NOT ${op_name} EQUAL "")
       file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")

diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake
@@ -317,8 +317,9 @@ if(WITH_ONNXRUNTIME)
 endif()
 
 if(WITH_GPU)
-  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION}
-                                                 GREATER_EQUAL 11.6)
+  if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
+     OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6
+         AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.8))
     include(external/cub) # download cub
     list(APPEND third_party_deps extern_cub)
   endif()

diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt
@@ -21,7 +21,7 @@ endif()
 if(WITH_NCCL OR WITH_RCCL)
   cc_library(
     processgroup_nccl
-    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc static_check.cc
+    SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc
     DEPS processgroup
          processgroup_stream
          place

diff --git a/paddle/fluid/distributed/collective/NCCLTools.cc b/paddle/fluid/distributed/collective/NCCLTools.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/distributed/collective/NCCLTools.h"
 
-#include "paddle/fluid/distributed/collective/Types.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
 namespace distributed {

diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h
@@ -21,42 +21,29 @@
 #include <hip/hip_runtime.h>
 #endif
 
-#include <error.h>
-
 #include <string>
 
 #include "paddle/fluid/distributed/collective/Types.h"
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/framework/variable.h"
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-#include "paddle/fluid/platform/cuda_device_guard.h"
-#endif
-
-#include "paddle/fluid/platform/device_context.h"
 
 #ifdef PADDLE_WITH_RCCL
-#include "paddle/fluid/platform/dynload/rccl.h"
+#include "paddle/phi/backends/dynload/rccl.h"
 #else
-#include "paddle/fluid/platform/dynload/nccl.h"
+#include "paddle/phi/backends/dynload/nccl.h"
 #endif
 
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/utils/variant.h"
-
 namespace paddle {
 namespace distributed {
 
-#define NCCL_CHECK(cmd)                                 \
-  do {                                                  \
-    ncclResult_t r = cmd;                               \
-    if (r != ncclSuccess) {                             \
-      printf("Failed, NCCL error %s:%d '%s'\n",         \
-             __FILE__,                                  \
-             __LINE__,                                  \
-             platform::dynload::ncclGetErrorString(r)); \
-      exit(EXIT_FAILURE);                               \
-    }                                                   \
+#define NCCL_CHECK(cmd)                            \
+  do {                                             \
+    ncclResult_t r = cmd;                          \
+    if (r != ncclSuccess) {                        \
+      printf("Failed, NCCL error %s:%d '%s'\n",    \
+             __FILE__,                             \
+             __LINE__,                             \
+             phi::dynload::ncclGetErrorString(r)); \
+      exit(EXIT_FAILURE);                          \
+    }                                              \
   } while (0)
 
 ncclRedOp_t ToNCCLRedType(ReduceOp reduction);

diff --git a/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc b/paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
@@ -154,7 +154,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
   const auto& place = in_tensor.place();
   const auto& key = GetKeyFromPlace(place);
 
-  if (!calc_event_) {
+  if (!calc_event_ ||
+      (place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end())) {
     CreateBKCLEnvCache(place, key);
   }
 
@@ -170,6 +171,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
   fn(out_tensor, in_tensor, comm_ctx->bkcl_context(), bkcl_stream);
 
   if (!use_calc_stream) {
+    PADDLE_ENFORCE_NOT_NULL(
+        comm_ctx.get(), platform::errors::Fatal("comm context is nullptr."));
     task->comm_event_->Record(*comm_ctx.get());
   }
 
@@ -369,6 +372,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   return Collective(
       &out_tensors[0],
       in_tensors[0],
@@ -406,6 +413,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   return Collective(
       &out_tensors[0],
       in_tensors[0],
@@ -442,6 +453,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
 
   return Collective(
       &out_tensors[0],
@@ -481,6 +496,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
 
   return Collective(
       &out_tensors[0],
@@ -518,6 +537,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
       1,
       platform::errors::InvalidArgument(
           "BKCL only support single tensor collective communication."));
+  PADDLE_ENFORCE_EQ(
+      CheckTensorsInXPUPlace(in_tensors),
+      true,
+      platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
   PADDLE_ENFORCE_EQ(
       CheckTensorsInXPUPlace(out_tensors),
       true,