[pten] remove deprecated fluid op kernel for pten (#38842)
* update cmake file to remove fluid kernel

* add pten declaration.h to where pybind.h used

* fix sync_bn and tensorrt_engine

* refine detection_library

* fix interpreter_core

* support eager legacy

* fit eager legacy for pten

* fall back to cpu if not found kernel

* fix compile problem

* fix compile problem

* refine fallback logic

* fit operator.run()

* fix xpu compile

* fit for new_exec

* add REGISTER_OP_WITHOUT_GRADIENT

* un-cache pt_kernel_context

* fix compile

* fix cudnn

* fix compiling with on_infer

* fix mkldnn

* fix isfinite_v2

* fix xpu problem

* fix op_device

* refine fallback for xpu

* fix xpu compile

* merge develop

* refine code format

* fix compile

* fix compile

* add data_transfer

* fix PreparePtenData

* fix cpu context

* merge develop

* fix compile

* fix error device context

* fix xpu

* fix dev_ctx
zhiqiu authored Jan 26, 2022
1 parent 801159c commit 3ab9aef
Showing 39 changed files with 945 additions and 570 deletions.
262 changes: 151 additions & 111 deletions cmake/operators.cmake
@@ -1,6 +1,29 @@
# CMake file `unity_build` is used to handle Unity Build compilation.
include(unity_build)
set(PART_CUDA_KERNEL_FILES)

function(find_register FILENAME PATTERN OUTPUT)
# Find the op_name in REGISTER_OPERATOR(op_name, ...), REGISTER_OP_CPU_KERNEL(op_name, ...), etc.,
# and store it in OUTPUT.
set(options "")
set(oneValueArgs "")
set(multiValueArgs "")
file(READ ${FILENAME} CONTENT)
# message ("number of arguments sent to function: ${ARGC}")
# message ("all function arguments: ${ARGV}")
# message("PATTERN ${PATTERN}")
string(REGEX MATCH "${PATTERN}\\([ \t\r\n]*[a-z0-9_]*," register "${CONTENT}")
if (NOT register STREQUAL "")
string(REPLACE "${PATTERN}(" "" register "${register}")
string(REPLACE "," "" register "${register}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" register "${register}")
endif()

set(${OUTPUT} ${register} PARENT_SCOPE)
endfunction()
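For illustration, a minimal usage sketch of this helper (the file name relu_op.cc and the resulting op name are hypothetical, not taken from this commit):

# Hypothetical call: if relu_op.cc contains REGISTER_OPERATOR(relu, ...),
# op_name is set to "relu"; if the pattern is not found, op_name stays empty.
set(op_name "")
find_register(relu_op.cc "REGISTER_OPERATOR" op_name)
message(STATUS "registered op: ${op_name}")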

function(op_library TARGET)
# op_library is a function to create an op library. The interface is the same as
# cc_library, but it handles splitting GPU/CPU code and links some common libraries.
@@ -119,16 +142,16 @@ function(op_library TARGET)
list(APPEND miopen_cu_cc_srcs ${src})
elseif(WITH_ROCM AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND hip_cc_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu$")
elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu$")
list(APPEND cudnn_cu_srcs ${src})
elseif (${src} MATCHES ".*\\.cu$")
elseif (WITH_GPU AND ${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
elseif(WITH_GPU AND ${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_GPU AND ${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(WITH_XPU AND ${src} MATCHES ".*_op_xpu.cc$")
list(APPEND xpu_cc_srcs ${src})
elseif(WITH_ASCEND_CL AND ${src} MATCHES ".*_op_npu.cc$")
@@ -228,135 +251,136 @@ function(op_library TARGET)
endif()
endif()

list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH hip_srcs hip_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH hip_cc_srcs hip_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)

# Define operators that don't need pybind here.
foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op"
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op" "fused_attention_op" "resnet_unit_op" "fused_feedforward_op")

if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op")

if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
endforeach()

# For the registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to add just one operator to pybind in a *_op.cc file.
# For detailed pybind information, please see the generated paddle/pybind/pybind.h.
set(ORIGINAL_TARGET ${TARGET})
file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OPERATOR\\(.*REGISTER_OPERATOR\\(" multi_register "${TARGET_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OPERATOR\\([ \t\r\n]*[a-z0-9_]*," one_register "${multi_register}")
string(REGEX REPLACE "_op" "" TARGET "${TARGET}")

if (one_register STREQUAL "")
string(REPLACE "_op" "" TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OPERATOR(" "" TARGET "${one_register}")
string(REPLACE "," "" TARGET "${TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" TARGET "${TARGET}")
endif()
foreach(cc_src ${cc_srcs})
# pybind USE_OP_ITSELF
set(op_name "")
find_register(${cc_src} "REGISTER_OPERATOR" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
# hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
set(TARGET ${op_name})
set(pybind_flag 1)
endif()

set(op_name "")
find_register(${cc_src} "REGISTER_OP_WITHOUT_GRADIENT" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_ITSELF(${op_name});\n")
# hack: for example, the target in conv_transpose_op.cc is conv2d_transpose, used in mkldnn
set(TARGET ${op_name})
set(pybind_flag 1)
endif()

# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CPU
set(op_name "")
find_register(${cc_src} "REGISTER_OP_CPU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CPU);\n")
# why change TARGET here?
# when building paddle with on_infer, the REGISTER_OPERATOR(*_grad) will be removed before compiling (see details in remove_grad_op_and_kernel.py)
# in elementwise_op.cc, it will find REGISTER_OPERATOR(grad_add) and set TARGET to grad_add
# and, in the following "mkldnn" part, it will add USE_OP_DEVICE_KERNEL(grad_add, MKLDNN) to pybind.h
# however, grad_add has no mkldnn kernel.
set(TARGET ${op_name})
set(pybind_flag 1)
endif()
endforeach()
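As a rough illustration of what this loop emits: assuming a hypothetical op file whose REGISTER_OPERATOR and REGISTER_OP_CPU_KERNEL entries resolve to conv2d_transpose, the appends above would add lines equivalent to the following to the generated pybind file (the op name is assumed for illustration):

# Equivalent effect of the loop for an op whose registrations resolve to conv2d_transpose:
# the generated pybind file gains the lines
#   USE_OP_ITSELF(conv2d_transpose);
#   USE_OP_DEVICE_KERNEL(conv2d_transpose, CPU);
file(APPEND ${pybind_file} "USE_OP_ITSELF(conv2d_transpose);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CPU);\n")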

# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH hip_srcs hip_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH hip_cc_srcs hip_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
list(LENGTH npu_cc_srcs npu_cc_srcs_len)
list(LENGTH mlu_cc_srcs mlu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_srcs_len} EQUAL 0 AND ${hip_cc_srcs_len} EQUAL 0 AND ${miopen_cu_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0 AND
${npu_cc_srcs_len} EQUAL 0 AND ${mlu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
# pybind USE_OP_DEVICE_KERNEL for CUDA
list (APPEND cu_srcs ${cu_cc_srcs})
# message("cu_srcs ${cu_srcs}")
foreach(cu_src ${cu_srcs})
set(op_name "")
find_register(${cu_src} "REGISTER_OP_CUDA_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDA);\n")
set(pybind_flag 1)
endif()
endforeach()

# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_cc_srcs cudnn_cu_cc_srcs_len)
if (WITH_GPU AND ${cudnn_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
endif()

# pybind USE_OP_DEVICE_KERNEL for MIOPEN
list(LENGTH miopen_cu_cc_srcs miopen_cu_cc_srcs_len)
if (WITH_ROCM AND ${miopen_cu_cc_srcs_len} GREATER 0)
if(${TARGET} STREQUAL "activation")
# pybind USE_OP_DEVICE_KERNEL for CUDNN/MIOPEN
list(APPEND cudnn_cu_srcs ${cudnn_cu_cc_srcs})
list(APPEND cudnn_cu_srcs ${miopen_cu_cc_srcs})
list(APPEND cudnn_cu_srcs ${miopen_cu_srcs})
list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len)
#message("cudnn_cu_srcs ${cudnn_cu_srcs}")
if(${cudnn_cu_srcs_len} GREATER 0 AND ${ORIGINAL_TARGET} STREQUAL "activation_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, CUDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
else()
foreach(cudnn_src ${cudnn_cu_srcs})
set(op_name "")
find_register(${cudnn_src} "REGISTER_OP_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, CUDNN);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()

# pybind USE_OP_DEVICE_KERNEL for CUDNN
list(LENGTH cudnn_cu_srcs cudnn_cu_srcs_len)
if (WITH_GPU AND ${cudnn_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()

# pybind USE_OP_DEVICE_KERNEL for MIOPEN
list(LENGTH miopen_cu_srcs miopen_cu_srcs_len)
if (WITH_ROCM AND ${miopen_cu_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
if(${ORIGINAL_TARGET} STREQUAL "activation_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, XPU);\n")
else()
foreach(xpu_src ${xpu_cc_srcs})
set(op_name "")
find_register(${xpu_src} "REGISTER_OP_XPU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, XPU);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()

if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
endif()

# pybind USE_OP_DEVICE_KERNEL for NPU
if (WITH_ASCEND_CL AND ${npu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_npu.cc TARGET_NPU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\(.*" multi_npu_register "${TARGET_NPU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_NPU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_npu_register "${multi_npu_register}")

if (one_npu_register STREQUAL "")
string(REPLACE "_op" "" NPU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_NPU_KERNEL(" "" NPU_TARGET "${one_npu_register}")
string(REPLACE "," "" NPU_TARGET "${NPU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" NPU_TARGET "${NPU_TARGET}")
foreach(npu_src ${npu_cc_srcs})
set(op_name "")
find_register(${npu_src} "REGISTER_OP_NPU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, NPU);\n")
set(pybind_flag 1)
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${NPU_TARGET}, NPU);\n")
endforeach()
endif()
if (WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
file(READ ${ORIGINAL_TARGET}_mlu.cc TARGET_MLU_CONTENT)
# It is different from the logic above, be careful
string(REGEX MATCH "REGISTER_OP_MLU_KERNEL\\(.*" multi_mlu_register "${TARGET_MLU_CONTENT}")
# [ \t\r\n]* is used for blank characters
string(REGEX MATCH "REGISTER_OP_MLU_KERNEL\\([ \t\r\n]*[a-z0-9_]*," one_mlu_register "${multi_mlu_register}")

if (one_mlu_register STREQUAL "")
string(REPLACE "_op" "" MLU_TARGET "${TARGET}")
else ()
string(REPLACE "REGISTER_OP_MLU_KERNEL(" "" MLU_TARGET "${one_mlu_register}")
string(REPLACE "," "" MLU_TARGET "${MLU_TARGET}")
# [ \t\r\n]+ is used for blank characters.
# Here we use '+' instead of '*' since it is a REPLACE operation.
string(REGEX REPLACE "[ \t\r\n]+" "" MLU_TARGET "${MLU_TARGET}")
# pybind USE_OP_DEVICE_KERNEL for MLU
if (WITH_MLU AND ${mlu_cc_srcs_len} GREATER 0)
foreach(mlu_src ${mlu_cc_srcs})
set(op_name "")
find_register(${mlu_src} "REGISTER_OP_MLU_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MLU);\n")
set(pybind_flag 1)
endif()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${MLU_TARGET}, MLU);\n")
endforeach()
endif()

# pybind USE_OP_DEVICE_KERNEL for MKLDNN
@@ -377,10 +401,26 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
foreach(mkldnn_src ${mkldnn_cc_srcs})
set(op_name "")
find_register(${mkldnn_src} "REGISTER_OP_KERNEL" op_name)
if(NOT ${op_name} EQUAL "")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${op_name}, MKLDNN);\n")
set(pybind_flag 1)
endif()
endforeach()
endif()
endif()

# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
set(pybind_flag 1)
endif()

# pybind USE_OP
if (${pybind_flag} EQUAL 0)
# NOTE(*): activation use macro to regist the kernels, set use_op manually.
6 changes: 5 additions & 1 deletion paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -27,6 +27,9 @@
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"

// pten
#include "paddle/pten/kernels/declarations.h"

#define NUM_CREATED_DUP_INPUTS 4

namespace paddle {
@@ -535,7 +538,8 @@ static bool CheckOpProto(proto::OpProto* op_proto) {
// Skip operators that do not inherit from OperatorWithKernel, like while,
// since only OperatorWithKernel can run in dygraph mode.
auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels();
if (!all_kernels.count(op_type)) {
if (!all_kernels.count(op_type) &&
!pten::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) {
return false;
}

1 change: 1 addition & 0 deletions paddle/fluid/eager/legacy/op_runner.cc
@@ -93,6 +93,7 @@ void OpRunImpl(const paddle::framework::OperatorBase& op,
prepared_op.Run(*tmp_ins_ptr, outs, attrs, default_attrs);
}

VLOG(6) << "Run Prepared Op end";
// TODO(jiabin): Set the output var's grad Forward DataType
}
