Skip to content

Commit

Permalink
[Cherry-Pick] AMP OP&Test support from Hackathon (#53522)
Browse files Browse the repository at this point in the history
低精度算子支持和单测补充,合并 cherry pick 17个Hackathon PR,共覆盖25个OP的低精度支持及完善
  • Loading branch information
ZzSean authored May 6, 2023
1 parent 584d610 commit 39b704c
Show file tree
Hide file tree
Showing 70 changed files with 2,282 additions and 230 deletions.
4 changes: 4 additions & 0 deletions paddle/fluid/operators/math/prelu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/math/prelu.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {
Expand Down Expand Up @@ -135,14 +136,17 @@ void PreluScalarDirectCUDAFunctor<T>::operator()(gpuStream_t stream,

template class PreluChannelWiseDirectCUDAFunctor<float>;
template class PreluChannelWiseDirectCUDAFunctor<platform::float16>;
template class PreluChannelWiseDirectCUDAFunctor<platform::bfloat16>;
template class PreluChannelWiseDirectCUDAFunctor<double>;

template class PreluElementWiseDirectCUDAFunctor<float>;
template class PreluElementWiseDirectCUDAFunctor<platform::float16>;
template class PreluElementWiseDirectCUDAFunctor<platform::bfloat16>;
template class PreluElementWiseDirectCUDAFunctor<double>;

template class PreluScalarDirectCUDAFunctor<float>;
template class PreluScalarDirectCUDAFunctor<platform::float16>;
template class PreluScalarDirectCUDAFunctor<platform::bfloat16>;
template class PreluScalarDirectCUDAFunctor<double>;

} // namespace math
Expand Down
68 changes: 68 additions & 0 deletions paddle/phi/kernels/funcs/blas/blas_impl.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -1316,6 +1316,74 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
int lda,
const phi::dtype::bfloat16 *B,
int ldb,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int ldc) const {
#if CUDA_VERSION >= 11000
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;

PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
phi::errors::InvalidArgument(
"cublas bf16 gemm requires GPU compute capability >= 80,"
"but received %d",
context_.GetComputeCapability()));

float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);

cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT;
bool use_tensor_op_math = context_.tensor_core_available();
if (use_tensor_op_math) {
algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
}
VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False");

context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) {
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
CUDA_R_16BF,
ldb,
A,
CUDA_R_16BF,
lda,
&h_beta,
C,
CUDA_R_16BF,
ldc,
CUDA_R_32F,
algo));
});
#else
// raise error
PADDLE_THROW(phi::errors::Unimplemented(
"cublasGemmEx with bfloat16 is not supported on cuda <= 11"));

#endif // CUDA_VERSION >= 11000
}

template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
Expand Down
66 changes: 65 additions & 1 deletion paddle/phi/kernels/funcs/blas/blas_impl.hip.h
Original file line number Diff line number Diff line change
Expand Up @@ -751,7 +751,7 @@ inline void Blas<phi::GPUContext>::GEMM(CBLAS_TRANSPOSE transA,
context_.GetComputeCapability(),
80,
phi::errors::InvalidArgument(
"rocblas fp16 gemm requires GPU compute capability >= 80,"
"rocblas bf16 gemm requires GPU compute capability >= 80,"
"but received %d",
context_.GetComputeCapability()));

Expand Down Expand Up @@ -982,6 +982,70 @@ inline void Blas<phi::GPUContext>::GEMM(bool transA,
});
}

template <>
template <>
inline void Blas<phi::GPUContext>::GEMM(bool transA,
bool transB,
int M,
int N,
int K,
phi::dtype::bfloat16 alpha,
const phi::dtype::bfloat16 *A,
int lda,
const phi::dtype::bfloat16 *B,
int ldb,
phi::dtype::bfloat16 beta,
phi::dtype::bfloat16 *C,
int ldc) const {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
rocblas_operation cuTransA = (transA == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
rocblas_operation cuTransB = (transB == CblasNoTrans)
? rocblas_operation_none
: rocblas_operation_transpose;
PADDLE_ENFORCE_GE(
context_.GetComputeCapability(),
80,
phi::errors::InvalidArgument(
"rocblas bf16 gemm requires GPU compute capability >= 80,"
"but received %d",
context_.GetComputeCapability()));

float h_alpha = static_cast<float>(alpha);
float h_beta = static_cast<float>(beta);
rocblas_gemm_algo algo = rocblas_gemm_algo_standard;

context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) {
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::rocblas_gemm_ex(handle,
cuTransB,
cuTransA,
N,
M,
K,
&h_alpha,
B,
rocblas_datatype_bf16_r,
ldb,
A,
rocblas_datatype_bf16_r,
lda,
&h_beta,
C,
rocblas_datatype_bf16_r,
ldc,
C,
rocblas_datatype_bf16_r,
ldc,
rocblas_datatype_f32_r,
algo,
0,
0));
});
}

template <>
template <typename T>
void Blas<phi::GPUContext>::AXPY(int n, T alpha, const T *x, T *y) const {
Expand Down
23 changes: 23 additions & 0 deletions paddle/phi/kernels/funcs/elementwise_functor.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ limitations under the License. */

#pragma once

#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/enforce.h"
Expand Down Expand Up @@ -189,6 +190,17 @@ struct FMinFunctor<dtype::float16> {
}
};

template <>
struct FMinFunctor<dtype::bfloat16> {
inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a,
const dtype::bfloat16 b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmin(float_a, float_b);
return static_cast<dtype::bfloat16>(result);
}
};

template <>
struct FMinFunctor<int> {
inline HOSTDEVICE int operator()(const int a, const int b) const {
Expand Down Expand Up @@ -228,6 +240,17 @@ struct FMaxFunctor<dtype::float16> {
}
};

template <>
struct FMaxFunctor<dtype::bfloat16> {
inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a,
const dtype::bfloat16 b) const {
float float_a = static_cast<float>(a);
float float_b = static_cast<float>(b);
auto result = std::fmax(float_a, float_b);
return static_cast<dtype::bfloat16>(result);
}
};

template <>
struct FMaxFunctor<int> {
inline HOSTDEVICE int operator()(const int a, const int b) const {
Expand Down
14 changes: 8 additions & 6 deletions paddle/phi/kernels/funcs/gather_scatter_functor.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,14 @@ limitations under the License. */
namespace phi {
namespace funcs {

#define Instantiate_Template_Function(func) \
Instantiate_Template_Function_index_t( \
func, int) Instantiate_Template_Function_index_t(func, float) \
Instantiate_Template_Function_index_t(func, double) \
Instantiate_Template_Function_index_t(func, int64_t) \
Instantiate_Template_Function_index_t(func, phi::dtype::float16) \
#define Instantiate_Template_Function(func) \
Instantiate_Template_Function_index_t( \
func, int) Instantiate_Template_Function_index_t(func, float) \
Instantiate_Template_Function_index_t( \
func, double) Instantiate_Template_Function_index_t(func, int64_t) \
Instantiate_Template_Function_index_t(func, phi::dtype::float16) \
Instantiate_Template_Function_index_t(func, \
phi::dtype::bfloat16) \
Instantiate_Template_Function_index_t(func, unsigned char)

#define Instantiate_Template_Function_index_t(func, tensor_t) \
Expand Down
30 changes: 27 additions & 3 deletions paddle/phi/kernels/funcs/im2col.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/backends/gpu/gpu_primitives.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/kernels/funcs/im2col.h"

namespace phi {
Expand Down Expand Up @@ -71,7 +72,7 @@ __global__ void im2col(const T* data_im,
}
*data_col =
(rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
? 0
? T(0)
: data_im[im_idx];
data_col += col_height * col_width;
}
Expand Down Expand Up @@ -173,7 +174,7 @@ __global__ void col2im(int n,
int input_channels = n / im_height / im_width;

if (index < n) {
T val = 0;
T val = static_cast<T>(0);
int w = (data_layout != DataLayout::kNHWC
? index % im_width + padding_width
: (index / input_channels) % im_width + padding_width);
Expand Down Expand Up @@ -309,12 +310,24 @@ template class Im2ColFunctor<phi::funcs::ColFormat::kCFO,
template class Im2ColFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Im2ColFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
phi::dtype::float16>;
template class Im2ColFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
phi::dtype::bfloat16>;
template class Col2ImFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
float>;
template class Col2ImFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
double>;
template class Col2ImFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
phi::dtype::float16>;
template class Col2ImFunctor<phi::funcs::ColFormat::kCFO,
phi::GPUContext,
phi::dtype::bfloat16>;

template <class T>
__global__ void im2colOCF(const T* im_data,
Expand Down Expand Up @@ -560,13 +573,24 @@ template class Im2ColFunctor<phi::funcs::ColFormat::kOCF,
template class Im2ColFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
double>;

template class Im2ColFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
phi::dtype::float16>;
template class Im2ColFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
phi::dtype::bfloat16>;
template class Col2ImFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
float>;
template class Col2ImFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
double>;
template class Col2ImFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
phi::dtype::float16>;
template class Col2ImFunctor<phi::funcs::ColFormat::kOCF,
phi::GPUContext,
phi::dtype::bfloat16>;

} // namespace funcs
} // namespace phi
24 changes: 20 additions & 4 deletions paddle/phi/kernels/funcs/pooling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -1963,7 +1963,7 @@ __global__ void KernelMaxPool2dWithIdx(const int nthreads,
wstart = max(wstart, 0);
}

T1 ele = -FLT_MAX;
T1 ele = static_cast<T1>(-FLT_MAX);
int max_index = -1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
Expand Down Expand Up @@ -2015,7 +2015,7 @@ __global__ void AdaptiveKernelMaxPool2dWithIdx(const int nthreads,
wstart = AdaptStartIndex(w_offset, input_width, output_width);
wend = AdaptEndIndex(w_offset, input_width, output_width);

T1 ele = -FLT_MAX;
T1 ele = static_cast<T1>(-FLT_MAX);
int max_index = -1;
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
Expand Down Expand Up @@ -2089,7 +2089,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(const int nthreads,
pwend = min((w_offset + padding_width) / stride_width + 1, output_width);
}

T1 input_grad_data = 0;
T1 input_grad_data = static_cast<T1>(0);
int input_current_featuremap_idx = h_offset * input_width + w_offset;
for (int ph = phstart; ph < phend; ++ph) {
for (int pw = pwstart; pw < pwend; ++pw) {
Expand Down Expand Up @@ -2259,6 +2259,14 @@ template class MaxPool2dWithIndexFunctor<phi::GPUContext, float, int>;
template class MaxPool2dWithIndexGradFunctor<phi::GPUContext, float, int>;
template class MaxPool2dWithIndexFunctor<phi::GPUContext, double, int>;
template class MaxPool2dWithIndexGradFunctor<phi::GPUContext, double, int>;
template class MaxPool2dWithIndexFunctor<phi::GPUContext, dtype::float16, int>;
template class MaxPool2dWithIndexGradFunctor<phi::GPUContext,
dtype::float16,
int>;
template class MaxPool2dWithIndexFunctor<phi::GPUContext, dtype::bfloat16, int>;
template class MaxPool2dWithIndexGradFunctor<phi::GPUContext,
dtype::bfloat16,
int>;

template <typename T1, typename T2>
__global__ void KernelMaxPool3DWithIdx(const int ncd,
Expand Down Expand Up @@ -2324,7 +2332,7 @@ __global__ void KernelMaxPool3DWithIdx(const int ncd,
wstart = max(wstart, 0);
}

T1 ele = -FLT_MAX;
T1 ele = static_cast<T1>(-FLT_MAX);
int max_index = -1;
for (int d = dstart; d < dend; ++d) {
for (int h = hstart; h < hend; ++h) {
Expand Down Expand Up @@ -2560,6 +2568,14 @@ template class MaxPool3dWithIndexFunctor<phi::GPUContext, float, int>;
template class MaxPool3dWithIndexGradFunctor<phi::GPUContext, float, int>;
template class MaxPool3dWithIndexFunctor<phi::GPUContext, double, int>;
template class MaxPool3dWithIndexGradFunctor<phi::GPUContext, double, int>;
template class MaxPool3dWithIndexFunctor<phi::GPUContext, dtype::float16, int>;
template class MaxPool3dWithIndexGradFunctor<phi::GPUContext,
dtype::float16,
int>;
template class MaxPool3dWithIndexFunctor<phi::GPUContext, dtype::bfloat16, int>;
template class MaxPool3dWithIndexGradFunctor<phi::GPUContext,
dtype::bfloat16,
int>;

} // namespace funcs
} // namespace phi
Loading

0 comments on commit 39b704c

Please sign in to comment.