Skip to content

Commit

Permalink
Merge branch 'develop' into remove_mkldnn_code
Browse files Browse the repository at this point in the history
  • Loading branch information
Silv3S committed Dec 14, 2022
2 parents 1006383 + b920705 commit 8603d90
Show file tree
Hide file tree
Showing 533 changed files with 9,828 additions and 12,976 deletions.
5 changes: 5 additions & 0 deletions .github/ISSUE_TEMPLATE/3_build-installation-issue.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,16 @@ body:
Paddle With CUDA:
OS:
GCC version:
Clang version:
CMake version:
Libc version:
Python version:
CUDA version:
cuDNN version:
Nvidia driver version:
Nvidia driver List:
****************************************
validations:
required: true
Expand Down
7 changes: 5 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ Testing
tools/__pycache__
tools/nvcc_lazy

# Ignore files generated from 'python setup.py develop'
@PADDLE_BINARY_DIR@

# This file is automatically generated.
# TODO(zhiqiang) Move this file to build directory.
paddle/infrt/dialect/pd/ir/pd_ops.td
Expand All @@ -73,8 +76,8 @@ tools/nvcc_lazy
# these files (directories) are generated before build system generation
paddle/fluid/operators/generated_op.cc
paddle/fluid/operators/generated_sparse_op.cc
paddle/phi/ops/compat/generated_sig.cc
paddle/phi/ops/compat/generated_sparse_sig.cc
paddle/fluid/operators/generated_static_op.cc
paddle/phi/ops/compat/generated_*.cc
paddle/phi/api/yaml/parsed_apis/
paddle/fluid/operators/generator/parsed_ops/
paddle/fluid/pybind/tmp_eager_op_function_impl.h
Expand Down
2 changes: 2 additions & 0 deletions cmake/cudnn.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,13 @@ list(
${CUDNN_ROOT}
${CUDNN_ROOT}/lib64
${CUDNN_ROOT}/lib
${CUDNN_ROOT}/lib/x64
${CUDNN_ROOT}/lib/${TARGET_ARCH}-linux-gnu
${CUDNN_ROOT}/local/cuda-${CUDA_VERSION}/targets/${TARGET_ARCH}-linux/lib/
$ENV{CUDNN_ROOT}
$ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib
$ENV{CUDNN_ROOT}/lib/x64
/usr/lib
${CUDA_TOOLKIT_ROOT_DIR}
${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
Expand Down
3 changes: 2 additions & 1 deletion cmake/flags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,8 @@ if(APPLE)
-Werror=uninitialized
-Werror=tautological-constant-out-of-range-compare
-Werror=literal-conversion
-Werror=pragma-pack)
-Werror=pragma-pack
-Werror=c++17-extensions)
endif()

if(WITH_HETERPS AND WITH_PSLIB)
Expand Down
33 changes: 33 additions & 0 deletions cmake/operators.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,35 @@ function(find_register FILENAME PATTERN OUTPUT)
PARENT_SCOPE)
endfunction()

# find_phi_register(FILENAME ADD_PATH)
#
# Scans the source file FILENAME for a PD_REGISTER_KERNEL(...) invocation and,
# when one is found, appends the matching
#   PD_DECLARE_KERNEL(<kernel_name>, <backend>, <layout>);
# line to the file ADD_PATH. Nothing is written when FILENAME contains no
# such registration.
#
# Arguments:
#   FILENAME - path of the source file to scan (read in full).
#   ADD_PATH - path of the file the declaration is appended to.
#
# NOTE(review): string(REGEX MATCH) matches only once, so only the first
# PD_REGISTER_KERNEL in FILENAME produces a declaration - confirm that this
# is the intended behavior for files registering several kernels.
function(find_phi_register FILENAME ADD_PATH)
  # Quote the path so a location containing spaces or ';' stays one argument.
  file(READ "${FILENAME}" CONTENT)

  # Capture "PD_REGISTER_KERNEL(<name>, <backend>, <layout>" - the first
  # three macro arguments. The second field tolerates whitespace, slashes and
  # a lowercase token in front of the backend (e.g. a "// cuda_only" comment).
  string(
    REGEX
      MATCH
      "PD_REGISTER_KERNEL\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
      kernel_signature
      "${CONTENT}")

  if(NOT kernel_signature STREQUAL "")
    # Drop the macro name, then turn the comma-separated arguments into a
    # CMake list so the individual fields can be indexed.
    string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_signature
                   "${kernel_signature}")
    string(REPLACE "," ";" kernel_signature "${kernel_signature}")
    # Strip whitespace/backslashes first; this collapses "// cuda_only" into
    # "//cuda_only", which the next replacement removes.
    string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_signature
                         "${kernel_signature}")
    string(REGEX REPLACE "//cuda_only" "" kernel_signature
                         "${kernel_signature}")

    list(GET kernel_signature 0 kernel_name)
    list(GET kernel_signature 1 kernel_backend)
    list(GET kernel_signature 2 kernel_layout)

    file(
      APPEND "${ADD_PATH}"
      "PD_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ${kernel_layout});\n"
    )
  endif()
endfunction()

function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same as
# cc_library, but it handles splitting GPU/CPU code and links some common library
Expand Down Expand Up @@ -371,6 +400,8 @@ function(op_library TARGET)
foreach(cc_src ${cc_srcs})
# pybind USE_OP_ITSELF
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cc_src} ${pybind_file})
find_register(${cc_src} "REGISTER_OPERATOR" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
Expand Down Expand Up @@ -408,6 +439,8 @@ function(op_library TARGET)
# message("cu_srcs ${cu_srcs}")
foreach(cu_src ${cu_srcs})
set(op_name "")
# Add PHI Kernel Registry Message
find_phi_register(${cu_src} ${pybind_file})
find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
Expand Down
5 changes: 3 additions & 2 deletions cmake/third_party.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -317,8 +317,9 @@ if(WITH_ONNXRUNTIME)
endif()

if(WITH_GPU)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0 OR ${CMAKE_CUDA_COMPILER_VERSION}
GREATER_EQUAL 11.6)
if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0
OR (${CMAKE_CUDA_COMPILER_VERSION} GREATER_EQUAL 11.6
AND ${CMAKE_CUDA_COMPILER_VERSION} LESS 11.8))
include(external/cub) # download cub
list(APPEND third_party_deps extern_cub)
endif()
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ endif()
if(WITH_NCCL OR WITH_RCCL)
cc_library(
processgroup_nccl
SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc static_check.cc
SRCS ProcessGroupNCCL.cc NCCLTools.cc Common.cc check.cc
DEPS processgroup
processgroup_stream
place
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/NCCLTools.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

#include "paddle/fluid/distributed/collective/NCCLTools.h"

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace distributed {
Expand Down
37 changes: 12 additions & 25 deletions paddle/fluid/distributed/collective/NCCLTools.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,42 +21,29 @@
#include <hip/hip_runtime.h>
#endif

#include <error.h>

#include <string>

#include "paddle/fluid/distributed/collective/Types.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/variable.h"

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif

#include "paddle/fluid/platform/device_context.h"

#ifdef PADDLE_WITH_RCCL
#include "paddle/fluid/platform/dynload/rccl.h"
#include "paddle/phi/backends/dynload/rccl.h"
#else
#include "paddle/fluid/platform/dynload/nccl.h"
#include "paddle/phi/backends/dynload/nccl.h"
#endif

#include "paddle/fluid/platform/enforce.h"
#include "paddle/utils/variant.h"

namespace paddle {
namespace distributed {

#define NCCL_CHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
platform::dynload::ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
#define NCCL_CHECK(cmd) \
do { \
ncclResult_t r = cmd; \
if (r != ncclSuccess) { \
printf("Failed, NCCL error %s:%d '%s'\n", \
__FILE__, \
__LINE__, \
phi::dynload::ncclGetErrorString(r)); \
exit(EXIT_FAILURE); \
} \
} while (0)

ncclRedOp_t ToNCCLRedType(ReduceOp reduction);
Expand Down
25 changes: 24 additions & 1 deletion paddle/fluid/distributed/collective/ProcessGroupBKCL.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
const auto& place = in_tensor.place();
const auto& key = GetKeyFromPlace(place);

if (!calc_event_) {
if (!calc_event_ ||
(place_to_comm_ctx_.find(key) == place_to_comm_ctx_.end())) {
CreateBKCLEnvCache(place, key);
}

Expand All @@ -170,6 +171,8 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Collective(
fn(out_tensor, in_tensor, comm_ctx->bkcl_context(), bkcl_stream);

if (!use_calc_stream) {
PADDLE_ENFORCE_NOT_NULL(
comm_ctx.get(), platform::errors::Fatal("comm context is nullptr."));
task->comm_event_->Record(*comm_ctx.get());
}

Expand Down Expand Up @@ -369,6 +372,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
return Collective(
&out_tensors[0],
in_tensors[0],
Expand Down Expand Up @@ -406,6 +413,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
return Collective(
&out_tensors[0],
in_tensors[0],
Expand Down Expand Up @@ -442,6 +453,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));

return Collective(
&out_tensors[0],
Expand Down Expand Up @@ -481,6 +496,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));

return Collective(
&out_tensors[0],
Expand Down Expand Up @@ -518,6 +537,10 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
1,
platform::errors::InvalidArgument(
"BKCL only support single tensor collective communication."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(in_tensors),
true,
platform::errors::InvalidArgument("All inputs should be in XPUPlace."));
PADDLE_ENFORCE_EQ(
CheckTensorsInXPUPlace(out_tensors),
true,
Expand Down
Loading

0 comments on commit 8603d90

Please sign in to comment.