diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu index 534a9ca35e3a6..9dc25e30ce9aa 100644 --- a/paddle/fluid/operators/math/prelu.cu +++ b/paddle/fluid/operators/math/prelu.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" namespace paddle { @@ -135,14 +136,17 @@ void PreluScalarDirectCUDAFunctor::operator()(gpuStream_t stream, template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; template class PreluChannelWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; template class PreluElementWiseDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; template class PreluScalarDirectCUDAFunctor; } // namespace math diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h index 35769d0e6e338..e39c860bc8096 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.cu.h @@ -1316,6 +1316,74 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { +#if CUDA_VERSION >= 11000 + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "cublas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cublasGemmEx(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + CUDA_R_16BF, + ldb, + A, + CUDA_R_16BF, + lda, + &h_beta, + C, + CUDA_R_16BF, + ldc, + CUDA_R_32F, + algo)); + }); +#else + // raise error + PADDLE_THROW(phi::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index 5edfe3a602c7b..bb02242e2db72 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -751,7 +751,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, context_.GetComputeCapability(), 80, phi::errors::InvalidArgument( - "rocblas fp16 gemm requires GPU compute capability >= 80," + "rocblas bf16 gemm requires GPU compute capability >= 80," "but received %d", context_.GetComputeCapability())); @@ -982,6 +982,70 @@ inline void Blas::GEMM(bool transA, }); } +template <> +template <> +inline void Blas::GEMM(bool transA, + bool transB, + int M, + int N, + int K, + phi::dtype::bfloat16 alpha, + const phi::dtype::bfloat16 *A, + int lda, + const phi::dtype::bfloat16 *B, + int ldb, + phi::dtype::bfloat16 beta, + phi::dtype::bfloat16 *C, + int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), + 80, + phi::errors::InvalidArgument( + "rocblas bf16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + C, + rocblas_datatype_bf16_r, + ldc, + rocblas_datatype_f32_r, + algo, + 0, + 0)); + }); +} + template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { diff --git a/paddle/phi/kernels/funcs/elementwise_functor.h b/paddle/phi/kernels/funcs/elementwise_functor.h index 3543c0c6aa066..508240a991644 100644 --- a/paddle/phi/kernels/funcs/elementwise_functor.h +++ b/paddle/phi/kernels/funcs/elementwise_functor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/core/enforce.h" @@ -189,6 +190,17 @@ struct FMinFunctor { } }; +template <> +struct FMinFunctor { + inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a, + const dtype::bfloat16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmin(float_a, float_b); + return static_cast(result); + } +}; + template <> struct FMinFunctor { inline HOSTDEVICE int operator()(const int a, const int b) const { @@ -228,6 +240,17 @@ struct FMaxFunctor { } }; +template <> +struct FMaxFunctor { + inline HOSTDEVICE dtype::bfloat16 operator()(const dtype::bfloat16 a, + const dtype::bfloat16 b) const { + float float_a = static_cast(a); + float float_b = static_cast(b); + auto result = std::fmax(float_a, float_b); + return static_cast(result); + } +}; + template <> struct FMaxFunctor { inline HOSTDEVICE int operator()(const int a, const int b) const { diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.h b/paddle/phi/kernels/funcs/gather_scatter_functor.h index 00e8f45d8ffb0..56068f9459ebd 100644 --- a/paddle/phi/kernels/funcs/gather_scatter_functor.h +++ b/paddle/phi/kernels/funcs/gather_scatter_functor.h @@ -21,12 +21,14 @@ limitations under the License. */ namespace phi { namespace funcs { -#define Instantiate_Template_Function(func) \ - Instantiate_Template_Function_index_t( \ - func, int) Instantiate_Template_Function_index_t(func, float) \ - Instantiate_Template_Function_index_t(func, double) \ - Instantiate_Template_Function_index_t(func, int64_t) \ - Instantiate_Template_Function_index_t(func, phi::dtype::float16) \ +#define Instantiate_Template_Function(func) \ + Instantiate_Template_Function_index_t( \ + func, int) Instantiate_Template_Function_index_t(func, float) \ + Instantiate_Template_Function_index_t( \ + func, double) Instantiate_Template_Function_index_t(func, int64_t) \ + Instantiate_Template_Function_index_t(func, phi::dtype::float16) \ + Instantiate_Template_Function_index_t(func, \ + phi::dtype::bfloat16) \ Instantiate_Template_Function_index_t(func, unsigned char) #define Instantiate_Template_Function_index_t(func, tensor_t) \ diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu index 78a0b345e389f..87c82adbb7fbe 100644 --- a/paddle/phi/kernels/funcs/im2col.cu +++ b/paddle/phi/kernels/funcs/im2col.cu @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_primitives.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/funcs/im2col.h" namespace phi { @@ -71,7 +72,7 @@ __global__ void im2col(const T* data_im, } *data_col = (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0) - ? 0 + ? T(0) : data_im[im_idx]; data_col += col_height * col_width; } @@ -173,7 +174,7 @@ __global__ void col2im(int n, int input_channels = n / im_height / im_width; if (index < n) { - T val = 0; + T val = static_cast(0); int w = (data_layout != DataLayout::kNHWC ? 
index % im_width + padding_width : (index / input_channels) % im_width + padding_width); @@ -309,12 +310,24 @@ template class Im2ColFunctor; +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; template __global__ void im2colOCF(const T* im_data, @@ -560,13 +573,24 @@ template class Im2ColFunctor; - +template class Im2ColFunctor; +template class Im2ColFunctor; template class Col2ImFunctor; template class Col2ImFunctor; +template class Col2ImFunctor; +template class Col2ImFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index d0a0416994169..2f89b51815e64 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -1963,7 +1963,7 @@ __global__ void KernelMaxPool2dWithIdx(const int nthreads, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2015,7 +2015,7 @@ __global__ void AdaptiveKernelMaxPool2dWithIdx(const int nthreads, wstart = AdaptStartIndex(w_offset, input_width, output_width); wend = AdaptEndIndex(w_offset, input_width, output_width); - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -2089,7 +2089,7 @@ __global__ void KernelMaxPool2DWithIdxGrad(const int nthreads, pwend = min((w_offset + padding_width) / stride_width + 1, output_width); } - T1 input_grad_data = 0; + T1 input_grad_data = static_cast(0); int input_current_featuremap_idx = h_offset * input_width + w_offset; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { @@ -2259,6 +2259,14 @@ template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; template class MaxPool2dWithIndexFunctor; template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx(const int ncd, @@ -2324,7 +2332,7 @@ __global__ void KernelMaxPool3DWithIdx(const int ncd, wstart = max(wstart, 0); } - T1 ele = -FLT_MAX; + T1 ele = static_cast(-FLT_MAX); int max_index = -1; for (int d = dstart; d < dend; ++d) { for (int h = hstart; h < hend; ++h) { @@ -2560,6 +2568,14 @@ template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; template class MaxPool3dWithIndexFunctor; template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu index 65978da1374e4..9d915af9170f6 100644 --- a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu @@ -18,5 +18,11 @@ limitations under the License. 
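Before the addmm kernels below (which go through these BLAS paths), one note on the bf16 GEMM specializations added earlier in blas_impl.cu.h / blas_impl.hip.h: the matrices stay in bfloat16 storage (CUDA_R_16BF / rocblas_datatype_bf16_r) while alpha, beta and the accumulation run in float32. The numpy sketch below only imitates that contract; `truncate_to_bf16` is a helper local to the sketch (it drops the low 16 mantissa bits, much like the test-side `copy_bits_from_float_to_uint16`), not a Paddle API.

```python
import numpy as np

def truncate_to_bf16(x):
    # Keep only the high 16 bits of each float32 value, which is how a
    # bfloat16 store behaves (ignoring rounding-mode details).
    u = np.ascontiguousarray(x, dtype=np.float32).view(np.uint32)
    return (u & np.uint32(0xFFFF0000)).view(np.float32)

def bf16_gemm(alpha, a, b, beta, c):
    # Inputs/outputs are (simulated) bfloat16, the multiply-accumulate runs
    # in float32 -- the same split cublasGemmEx is configured for here.
    a32, b32, c32 = truncate_to_bf16(a), truncate_to_bf16(b), truncate_to_bf16(c)
    acc = np.float32(alpha) * (a32 @ b32) + np.float32(beta) * c32
    return truncate_to_bf16(acc)

rng = np.random.default_rng(0)
a = rng.random((64, 32), dtype=np.float32)
b = rng.random((32, 16), dtype=np.float32)
c = np.zeros((64, 16), dtype=np.float32)
# error stays small: only the 16-bit storage loses precision, not the accumulation
print(np.abs(bf16_gemm(1.0, a, b, 0.0, c) - a @ b).max())
```

The compute-capability >= 80 check in both new GEMM paths lines up with bf16 tensor cores first appearing on Ampere (SM 8.0).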
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - addmm_grad, GPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} +PD_REGISTER_KERNEL(addmm_grad, + GPU, + ALL_LAYOUT, + phi::AddmmGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/addmm_kernel.cu b/paddle/phi/kernels/gpu/addmm_kernel.cu index 7b589ce20acca..563b137040ac7 100644 --- a/paddle/phi/kernels/gpu/addmm_kernel.cu +++ b/paddle/phi/kernels/gpu/addmm_kernel.cu @@ -18,4 +18,11 @@ limitations under the License. */ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/addmm_kernel_impl.h" -PD_REGISTER_KERNEL(addmm, GPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} +PD_REGISTER_KERNEL(addmm, + GPU, + ALL_LAYOUT, + phi::AddmmKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu index 7acfd33e94a9a..a202c428394d7 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_grad_kernel.cu @@ -16,7 +16,7 @@ #include -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" @@ -104,8 +104,10 @@ PD_REGISTER_KERNEL(broadcast_tensors_grad, GPU, ALL_LAYOUT, phi::BroadcastTensorsGradKernel, + bool, int, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu index 82dac4552a42f..3d16797cb66c0 100644 --- a/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu +++ b/paddle/phi/kernels/gpu/broadcast_tensors_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/broadcast_tensors_kernel.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/broadcast_tensors_kernel_impl.h" @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(broadcast_tensors, int64_t, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu index 6c3abf843f998..cd80246944967 100644 --- a/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/clip_by_norm_kernel.cu @@ -17,7 +17,7 @@ #include #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/reduce_function.h" @@ -34,7 +34,7 @@ void ClipByNormKernel(const Context& dev_ctx, return ClipByNormFunctor(dev_ctx, in, max_norm, output); } auto input = ∈ - dev_ctx.template Alloc(output); + dev_ctx.template Alloc(output); PADDLE_ENFORCE_NOT_NULL(input, phi::errors::InvalidArgument( @@ -49,20 +49,14 @@ void ClipByNormKernel(const Context& dev_ctx, auto* tmp = &tmp_tensor; tmp->Resize({1}); dev_ctx.template Alloc(tmp); - phi::funcs::ReduceKernel>( - dev_ctx, - *input, - tmp, - kps::SquareFunctor(), - reduce_dims); + phi::funcs:: + ReduceKernel>( + dev_ctx, *input, tmp, kps::SquareFunctor(), 
reduce_dims); auto tmp_eigen = phi::EigenVector::Flatten(*tmp); auto x_norm = tmp_eigen.sqrt(); - auto x = phi::EigenVector::Flatten(*input); - auto out = phi::EigenVector::Flatten(*output); + auto x = phi::EigenVector::Flatten(*input); + auto out = phi::EigenVector::Flatten(*output); auto* place = dev_ctx.eigen_device(); auto temp = (x_norm <= max_norm).template cast(); @@ -72,7 +66,7 @@ void ClipByNormKernel(const Context& dev_ctx, auto scaling = (temp + (static_cast(1) - temp) * max_norm / (x_norm + epsilon)) - .template cast(); + .template cast(); Eigen::array one_dim{{1}}; Eigen::DSizes m_dsize(input->numel()); @@ -86,4 +80,5 @@ PD_REGISTER_KERNEL(clip_by_norm, ALL_LAYOUT, phi::ClipByNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 4ff1164e64e08..0dac34f5baccd 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -98,6 +98,7 @@ PD_REGISTER_KERNEL(fmax_grad, double, int, phi::dtype::float16, + phi::dtype::bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin_grad, @@ -108,6 +109,7 @@ PD_REGISTER_KERNEL(fmin_grad, double, int, phi::dtype::float16, + phi::dtype::bfloat16, int64_t) {} PD_REGISTER_KERNEL(maximum_grad, diff --git a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu index 88e42a16dba36..c0d5b737c5fbb 100644 --- a/paddle/phi/kernels/gpu/index_add_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_grad_kernel.cu @@ -105,5 +105,6 @@ PD_REGISTER_KERNEL(index_add_grad, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_add_kernel.cu b/paddle/phi/kernels/gpu/index_add_kernel.cu index f4894fc6d9308..9ba5bde7f6136 100644 --- a/paddle/phi/kernels/gpu/index_add_kernel.cu +++ b/paddle/phi/kernels/gpu/index_add_kernel.cu @@ -123,5 +123,6 @@ PD_REGISTER_KERNEL(index_add, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int, int64_t) {} diff --git a/paddle/phi/kernels/gpu/logspace_kernel.cu b/paddle/phi/kernels/gpu/logspace_kernel.cu index b5e4904fdcf69..458600e2f29e1 100644 --- a/paddle/phi/kernels/gpu/logspace_kernel.cu +++ b/paddle/phi/kernels/gpu/logspace_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/logspace_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/data_type_transform.h" @@ -25,25 +26,34 @@ namespace phi { template __global__ void LogspaceKernelInner( T start, T stop, double step, T base, int64_t size, T* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType mt_start = static_cast(start); + MPType mt_stop = static_cast(stop); + MPType mt_base = static_cast(base); + int64_t index = blockIdx.x * blockDim.x + threadIdx.x; for (; index < size; index += blockDim.x * gridDim.x) { if (index < size / 2) { out[index] = - static_cast(pow(static_cast(base), - static_cast(start + step * index))); + static_cast(pow(static_cast(mt_base), + static_cast(mt_start + step * index))); } else { out[index] = static_cast( - pow(static_cast(base), - static_cast(stop - step * (size - index - 1)))); + pow(static_cast(mt_base), + static_cast(mt_stop - step * (size - index - 1)))); } } } template __global__ void LogspaceSpecialKernel(T start, T 
base, T* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + MPType mt_start = static_cast(start); + MPType mt_base = static_cast(base); + out[0] = static_cast( - pow(static_cast(base), static_cast(start))); + pow(static_cast(mt_base), static_cast(mt_start))); } template @@ -54,6 +64,8 @@ void LogspaceKernel(const Context& ctx, const DenseTensor& base, DataType dtype, DenseTensor* out) { + using MPType = typename phi::dtype::MPTypeTrait::Type; + auto start_t = phi::funcs::TransDataType(ctx, start, dtype); auto stop_t = phi::funcs::TransDataType(ctx, stop, dtype); auto base_t = phi::funcs::TransDataType(ctx, base, dtype); @@ -71,6 +83,9 @@ void LogspaceKernel(const Context& ctx, phi::Copy(ctx, base_t, phi::CPUPlace(), false, &n_base); T base_data = n_base.data()[0]; + MPType mt_start_data = static_cast(start_data); + MPType mt_stop_data = static_cast(stop_data); + PADDLE_ENFORCE_GT( num, 0, @@ -86,7 +101,7 @@ void LogspaceKernel(const Context& ctx, int block = 512; int grid = (num + block - 1) / block; if (num != 1) { - step = (static_cast(stop_data - start_data)) / (num - 1); + step = (static_cast(mt_stop_data - mt_start_data)) / (num - 1); LogspaceKernelInner<<>>( start_data, stop_data, step, base_data, num, out_data); } else { @@ -104,4 +119,6 @@ PD_REGISTER_KERNEL(logspace, float, int32_t, int64_t, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu index 4b48e2eb17322..e27d5251a9144 100644 --- a/paddle/phi/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/matmul_grad_kernel.cu @@ -64,4 +64,5 @@ PD_REGISTER_KERNEL(matmul_with_flatten_double_grad, phi::MatmulWithFlattenDoubleGradKernel, float, double, + phi::dtype::bfloat16, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu index 9c665ed1233e0..b1da8858a8bcd 100644 --- a/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/multi_dot_grad_kernel.cu @@ -15,16 +15,15 @@ limitations under the License. */ #include "paddle/phi/kernels/multi_dot_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" -using float16 = phi::dtype::float16; - PD_REGISTER_KERNEL(multi_dot_grad, GPU, ALL_LAYOUT, phi::MultiDotGradKernel, float, double, - float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/multi_dot_kernel.cu b/paddle/phi/kernels/gpu/multi_dot_kernel.cu index be73e2d162a4b..f6328f400cd2d 100644 --- a/paddle/phi/kernels/gpu/multi_dot_kernel.cu +++ b/paddle/phi/kernels/gpu/multi_dot_kernel.cu @@ -15,11 +15,15 @@ limitations under the License. 
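The logspace change above follows the usual mixed-precision recipe: start/stop/base are widened to MPTypeTrait<T>::Type (float for float16/bfloat16) before the step and pow() arithmetic, and only the final values are narrowed back to T. A rough numpy sketch of that recipe, with np.float16 standing in for the narrow dtype; the real kernel additionally fills the upper half of the range from the stop endpoint, which the sketch skips.

```python
import numpy as np

def logspace_lowp(start, stop, num, base, dtype=np.float16):
    # Widen the endpoints to float32 (the "MPType"), compute the exponents and
    # pow() there, and only narrow the final values back to the storage dtype.
    start32, stop32, base32 = np.float32(start), np.float32(stop), np.float32(base)
    step = (stop32 - start32) / (num - 1) if num > 1 else np.float32(0.0)
    exponents = start32 + step * np.arange(num, dtype=np.float32)
    return np.power(base32, exponents).astype(dtype)

print(logspace_lowp(np.float16(0.0), np.float16(3.0), 4, np.float16(10.0)))
# [   1.   10.  100. 1000.]
```

Doing the `stop - start` subtraction directly in float16/bfloat16 would already lose bits before the kernel starts, which is what the mt_start/mt_stop promotion avoids.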
*/ #include "paddle/phi/kernels/multi_dot_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/multi_dot_kernel_impl.h" -using float16 = phi::dtype::float16; - -PD_REGISTER_KERNEL( - multi_dot, GPU, ALL_LAYOUT, phi::MultiDotKernel, float, double, float16) {} +PD_REGISTER_KERNEL(multi_dot, + GPU, + ALL_LAYOUT, + phi::MultiDotKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pool_grad_kernel.cu b/paddle/phi/kernels/gpu/pool_grad_kernel.cu index e4cfcb23b730e..c625977543558 100644 --- a/paddle/phi/kernels/gpu/pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_grad_kernel.cu @@ -38,7 +38,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index_grad, ALL_LAYOUT, phi::MaxPool2dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -55,6 +57,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index_grad, ALL_LAYOUT, phi::MaxPool3dWithIndexGradKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/paddle/phi/kernels/gpu/pool_kernel.cu b/paddle/phi/kernels/gpu/pool_kernel.cu index 65d0ef4bdc916..511cc263bc760 100644 --- a/paddle/phi/kernels/gpu/pool_kernel.cu +++ b/paddle/phi/kernels/gpu/pool_kernel.cu @@ -32,7 +32,9 @@ PD_REGISTER_KERNEL(max_pool2d_with_index, ALL_LAYOUT, phi::MaxPool2dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } @@ -49,6 +51,8 @@ PD_REGISTER_KERNEL(max_pool3d_with_index, ALL_LAYOUT, phi::MaxPool3dWithIndexKernel, float, - double) { + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(1).SetDataType(phi::CppTypeToDataType::Type()); } diff --git a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu index 2a9512e495cc9..b613cdc606ece 100644 --- a/paddle/phi/kernels/gpu/prelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_grad_kernel.cu @@ -189,4 +189,5 @@ PD_REGISTER_KERNEL(prelu_grad, phi::PReluGradKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/paddle/phi/kernels/gpu/prelu_kernel.cu b/paddle/phi/kernels/gpu/prelu_kernel.cu index 679d7ba3f2e7d..246b6fdc95d28 100644 --- a/paddle/phi/kernels/gpu/prelu_kernel.cu +++ b/paddle/phi/kernels/gpu/prelu_kernel.cu @@ -79,4 +79,5 @@ PD_REGISTER_KERNEL(prelu, phi::PReluKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index 77276f244a2b7..8321bcd1aa7ac 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -75,4 +75,5 @@ PD_REGISTER_KERNEL(put_along_axis_grad, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index e44580fd2da85..b63047973e9b8 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -82,4 +82,5 @@ PD_REGISTER_KERNEL(put_along_axis, double, int64_t, int, - 
phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index 6ae0e6df07cfb..456d541451e64 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -28,6 +28,7 @@ namespace cub = hipcub; #include "gflags/gflags.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -165,4 +166,6 @@ PD_REGISTER_KERNEL(randperm, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu index ea1d377c45976..86cccc5e03b1c 100644 --- a/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_min_grad_kernel.cu @@ -16,8 +16,63 @@ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/reduce_min_grad_kernel_impl.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/compare_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +namespace phi { + +template +void ReduceMinGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& out_grad, + const IntArray& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* x_grad) { + dev_ctx.Alloc(x_grad, x.dtype()); + reduce_all = recompute_reduce_all(x, dims, reduce_all); + + // get reduce_dim + int dim_size = x.dims().size(); + auto reduce_dims = + funcs::details::GetReduceDim(dims.GetData(), dim_size, reduce_all); + auto update_dims = vectorize(x.dims()); + for (auto i : reduce_dims) { + update_dims[i] = 1; + } + + // make new tensor of out and out_grad + phi::DenseTensor new_out(out.type()); + new_out.ShareDataWith(out); + new_out.Resize(phi::make_ddim(update_dims)); + + phi::DenseTensor new_out_grad(out_grad.type()); + new_out_grad.ShareDataWith(out_grad); + new_out_grad.Resize(phi::make_ddim(update_dims)); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(x.dims()); + dev_ctx.template Alloc(equal_out); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_out, &x}; + std::vector equal_outputs = {equal_out}; + funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, funcs::EqualFunctor()); + + // 2. 
dx = dout * 1 + std::vector mul_inputs = {&new_out_grad, equal_out}; + std::vector mul_outputs = {x_grad}; + funcs::BroadcastKernel( + dev_ctx, mul_inputs, &mul_outputs, 0, funcs::MultiplyFunctor()); + delete equal_out; +} +} // namespace phi PD_REGISTER_KERNEL(min_grad, GPU, ALL_LAYOUT, @@ -25,4 +80,6 @@ PD_REGISTER_KERNEL(min_grad, float, double, int, - int64_t) {} + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu index c4fffce6ccc07..f38ab6416693f 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_grad_kernel.cu @@ -68,4 +68,5 @@ PD_REGISTER_KERNEL(take_along_axis_grad, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu index 3c919f5acd1cd..6ad828f2680b3 100644 --- a/paddle/phi/kernels/gpu/take_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/take_along_axis_kernel.cu @@ -54,4 +54,5 @@ PD_REGISTER_KERNEL(take_along_axis, double, int64_t, int, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/trace_grad_kernel.cu b/paddle/phi/kernels/gpu/trace_grad_kernel.cu index 6692c1e19b033..a97e71a01874e 100644 --- a/paddle/phi/kernels/gpu/trace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_grad_kernel.cu @@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(trace_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/trace_kernel.cu b/paddle/phi/kernels/gpu/trace_kernel.cu index 4a749c5b3347d..671ca490e136a 100644 --- a/paddle/phi/kernels/gpu/trace_kernel.cu +++ b/paddle/phi/kernels/gpu/trace_kernel.cu @@ -52,5 +52,6 @@ PD_REGISTER_KERNEL(trace, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/unfold_grad_kernel.cu b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu index 9cd874ec6a6f7..3484fe8fdc75e 100644 --- a/paddle/phi/kernels/gpu/unfold_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/unfold_grad_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - unfold_grad, GPU, ALL_LAYOUT, phi::UnfoldGradKernel, float, double) {} +PD_REGISTER_KERNEL(unfold_grad, + GPU, + ALL_LAYOUT, + phi::UnfoldGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/unfold_kernel.cu b/paddle/phi/kernels/gpu/unfold_kernel.cu index db0d259791b60..f816db028cbc1 100644 --- a/paddle/phi/kernels/gpu/unfold_kernel.cu +++ b/paddle/phi/kernels/gpu/unfold_kernel.cu @@ -15,7 +15,15 @@ #include "paddle/phi/kernels/unfold_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unfold_kernel_impl.h" -PD_REGISTER_KERNEL(unfold, GPU, ALL_LAYOUT, phi::UnfoldKernel, float, double) {} +PD_REGISTER_KERNEL(unfold, + GPU, + ALL_LAYOUT, + phi::UnfoldKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu index 61efe8807643d..3b7f8a931278e 100644 
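The new ReduceMinGradKernel above is just two broadcast passes: EqualFunctor marks every position of x that attained the reduced minimum, and MultiplyFunctor routes out_grad to those positions (ties all receive the full upstream gradient). An equivalent numpy sketch:

```python
import numpy as np

def reduce_min_grad(x, out, out_grad, axis):
    # Reshape the reduced tensors so they broadcast against x (the kernel does
    # this by setting the reduced dims of out / out_grad to 1).
    out_b = np.expand_dims(out, axis)
    dout_b = np.expand_dims(out_grad, axis)
    # 1. equal_out = Equal(out, x)   2. dx = out_grad * equal_out
    equal = (x == out_b).astype(x.dtype)
    return dout_b * equal

x = np.array([[3., 1., 2.],
              [0., 5., 4.]], dtype=np.float32)
out = x.min(axis=1)
print(reduce_min_grad(x, out, np.ones_like(out), axis=1))
# [[0. 1. 0.]
#  [1. 0. 0.]]
```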
--- a/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_grad_kernel.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/kernels/uniform_inplace_grad_kernel.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/full_kernel.h" @@ -41,4 +42,6 @@ PD_REGISTER_KERNEL(uniform_inplace_grad, ALL_LAYOUT, phi::UniformInplaceGradKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu index 29bc2f4de5b86..5c3a886ad87e9 100644 --- a/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_inplace_kernel.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/distribution_helper.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" @@ -72,8 +73,12 @@ void UniformInplaceKernel(const Context& ctx, funcs::distribution_and_transform(ctx, out, dist, trans); } else { // Use OP seed - auto func = - UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + auto func = UniformGenerator(static_cast(min), + static_cast(max), + seed, + diag_num, + diag_step, + static_cast(diag_val)); IndexKernel>(ctx, out, func); } } @@ -85,4 +90,6 @@ PD_REGISTER_KERNEL(uniform_inplace, ALL_LAYOUT, phi::UniformInplaceKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h index a72fb6062ceff..7b05b08eef5fb 100644 --- a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -18,13 +18,34 @@ limitations under the License. 
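For uniform_inplace, the OP-seed branch now casts the float min/max/diag_val arguments to T explicitly, but the observable behaviour for the newly registered float16/bfloat16 types is still "sample uniformly, store in the tensor's own dtype". A minimal sketch of that behaviour (the helper name is local to the sketch; np.float16 stands in for both 16-bit types):

```python
import numpy as np

def uniform_inplace_lowp(out, low, high, seed=None):
    # Sample in wide precision, then write back in the tensor's own dtype.
    rng = np.random.default_rng(seed)
    out[...] = rng.uniform(low, high, size=out.shape).astype(out.dtype)
    return out

buf = np.empty((2, 3), dtype=np.float16)
print(uniform_inplace_lowp(buf, -1.0, 1.0, seed=2023))
```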
*/ #include "glog/logging.h" +#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/kernels/addmm_grad_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { +template +struct CopyOrScaleFunctor { + CopyOrScaleFunctor(const float scale, const T* x, T* output, int64_t numel) + : scale_(scale), x_(x), output_(output), numel_(numel) {} + + HOSTDEVICE void operator()(int64_t idx) const { + using MPType = typename phi::dtype::MPTypeTrait::Type; + const MPType mp_scale = static_cast(scale_); + const MPType mp_x = static_cast(x_[idx]); + output_[idx] = static_cast(mp_scale * mp_x); + } + + private: + const float scale_; + const T* x_; + T* output_; + int64_t numel_; +}; + template ::Type; + bool is_float16_or_bfloat16 = false; + if (std::is_same::value || + std::is_same::value) { + is_float16_or_bfloat16 = true; + } + auto in_dims = input.dims(); if (input.dims().size() == 1) { in_dims = {1, input.dims()[0]}; @@ -65,6 +93,7 @@ void AddmmGradKernel(const Context& dev_ctx, } auto blas = funcs::GetBlas(dev_ctx); + auto mt_blas = funcs::GetBlas(dev_ctx); if (input_grad) { dev_ctx.template Alloc(input_grad); total_elems = in_dims[0] * in_dims[1]; @@ -78,19 +107,60 @@ void AddmmGradKernel(const Context& dev_ctx, Array2(input_grad->dims()[0], input_grad->dims()[1]); if (row_compress && col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum().eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum() + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else if (row_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(0)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else if (col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + if (!is_float16_or_bfloat16) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + eigen_dinput.device(place) = eigen_dout.template cast() + .sum(Array1(1)) + .eval() + .reshape(eigen_dinput_shape) + .template cast(); + } } else { - blas.VCOPY(total_elems, out_grad.data(), input_grad->data()); + // The VCOPY does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.VCOPY( + total_elems, out_grad.data(), input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + 1, out_grad.data(), input_grad->data(), total_elems); + for_range(functor); + } } - blas.SCAL(total_elems, beta, input_grad->data()); + // The SCAL does not support the float16, bfloat16 + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, beta, input_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + beta, input_grad->data(), input_grad->data(), total_elems); + for_range(functor); + } if (input.dims().size() == 1) { input_grad->Resize(input.dims()); @@ -101,14 +171,28 @@ void 
AddmmGradKernel(const Context& dev_ctx, total_elems = x.dims()[0] * x.dims()[1]; // x_grad = out_grad * y'. x_grad: M x K, out_grad : M x N, y : K x N blas.MatMul(out_grad, false, y, true, x_grad); - blas.SCAL(total_elems, alpha, x_grad->data()); + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, x_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + alpha, x_grad->data(), x_grad->data(), total_elems); + for_range(functor); + } } if (y_grad) { dev_ctx.template Alloc(y_grad); total_elems = x.dims()[1] * y.dims()[1]; // y_grad = x' * out_grad. y_grad K x N, out_grad : M x N, x : M x K blas.MatMul(x, true, out_grad, false, y_grad); - blas.SCAL(total_elems, alpha, y_grad->data()); + if (!is_float16_or_bfloat16) { + mt_blas.SCAL(total_elems, alpha, y_grad->data()); + } else { + phi::funcs::ForRange for_range(dev_ctx, total_elems); + CopyOrScaleFunctor functor( + alpha, y_grad->data(), y_grad->data(), total_elems); + for_range(functor); + } } } diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h index c86cea80e47e8..957e02de6af62 100644 --- a/paddle/phi/kernels/impl/addmm_kernel_impl.h +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -112,17 +112,19 @@ void AddmmKernel(const Context& dev_ctx, funcs::EigenBroadcast, T, 2>::Eval( place, eigen_out, eigen_input, bcast_dims); + T t_alpha = static_cast(alpha); + T t_beta = static_cast(beta); blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], - alpha, + t_alpha, x.data(), x_dims[1], y.data(), y_dims[1], - beta, + t_beta, out->data(), y_dims[1]); } diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index 245137943d571..dcab3a0d90e67 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -117,6 +117,7 @@ PD_REGISTER_KERNEL(fmax, double, int, float16, + bfloat16, int64_t) {} PD_REGISTER_KERNEL(fmin, @@ -127,6 +128,7 @@ PD_REGISTER_KERNEL(fmin, double, int, float16, + bfloat16, int64_t) {} PD_REGISTER_KERNEL(maximum_raw, diff --git a/paddle/phi/kernels/kps/reduce_min_kernel.cu b/paddle/phi/kernels/kps/reduce_min_kernel.cu index 450fee16b4ca9..8ed9ec30c1920 100644 --- a/paddle/phi/kernels/kps/reduce_min_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_min_kernel.cu @@ -36,6 +36,14 @@ void MinRawKernel(const Context& dev_ctx, #ifdef PADDLE_WITH_XPU_KP PD_REGISTER_KERNEL(min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float) {} #else -PD_REGISTER_KERNEL( - min_raw, KPS, ALL_LAYOUT, phi::MinRawKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(min_raw, + KPS, + ALL_LAYOUT, + phi::MinRawKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} #endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index c4c58c8342e60..ff50e9d1077b0 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -39,7 +39,20 @@ void MinKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( min, CPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(min, + GPU, + ALL_LAYOUT, + phi::MinKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} +#endif + +#if defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, 
int64_t) {} #endif diff --git a/python/paddle/fluid/tests/unittests/eager_op_test.py b/python/paddle/fluid/tests/unittests/eager_op_test.py index 0bcb34f956b23..1be451d65e83b 100644 --- a/python/paddle/fluid/tests/unittests/eager_op_test.py +++ b/python/paddle/fluid/tests/unittests/eager_op_test.py @@ -2689,6 +2689,26 @@ def fluid_dtype_to_np_dtype(self, dtype): def np_value_to_fluid_value(input): return input + def cast_bf16_output(self, block, cast_inputs): + output_names = [] + for i in range(0, len(cast_inputs)): + cast_output = block.create_var( + dtype="float32", shape=cast_inputs[i].shape + ) + cast_op = block.append_op( + inputs={"X": cast_inputs[i]}, + outputs={"Out": cast_output}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names.append(cast_output.name) + return output_names + def _get_gradient( self, input_to_check, @@ -2712,21 +2732,24 @@ def _get_gradient( if user_defined_grad_outputs is None: if self.dtype == np.uint16: cast_inputs = list(map(block.var, output_names)) - cast_outputs = block.create_var( - dtype="float32", shape=cast_inputs[0].shape - ) - cast_op = block.append_op( - inputs={"X": cast_inputs}, - outputs={"Out": cast_outputs}, - type="cast", - attrs={ - "in_dtype": core.VarDesc.VarType.BF16, - "out_dtype": core.VarDesc.VarType.FP32, - }, - ) - cast_op.desc.infer_var_type(block.desc) - cast_op.desc.infer_shape(block.desc) - output_names = [cast_outputs.name] + if self.op_type == "broadcast_tensors": + output_names = self.cast_bf16_output(block, cast_inputs) + else: + cast_outputs = block.create_var( + dtype="float32", shape=cast_inputs[0].shape + ) + cast_op = block.append_op( + inputs={"X": cast_inputs}, + outputs={"Out": cast_outputs}, + type="cast", + attrs={ + "in_dtype": core.VarDesc.VarType.BF16, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) + cast_op.desc.infer_var_type(block.desc) + cast_op.desc.infer_shape(block.desc) + output_names = [cast_outputs.name] loss = append_loss_ops(block, output_names) param_grad_list = append_backward( loss=loss, diff --git a/python/paddle/fluid/tests/unittests/test_addmm_op.py b/python/paddle/fluid/tests/unittests/test_addmm_op.py index 3041841cdf84f..66a86961e885d 100644 --- a/python/paddle/fluid/tests/unittests/test_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_addmm_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid -from paddle.fluid import Program, program_guard +from paddle.fluid import Program, core, program_guard class TestAddMMOp(OpTest): @@ -27,7 +27,6 @@ class TestAddMMOp(OpTest): def setUp(self): self.op_type = "addmm" self.python_api = paddle.addmm - self.dtype = np.float64 self.init_dtype_type() self.inputs = { 'Input': np.random.random((100, 1)).astype(self.dtype), @@ -40,7 +39,7 @@ def setUp(self): } def init_dtype_type(self): - pass + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -58,6 +57,62 @@ def test_check_grad_input(self): self.check_grad(['Input'], 'Out', no_grad_set=None) +class TestAddMMFP16Op(TestAddMMOp): + def init_dtype_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output(atol=1e-2) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not 
compiled with CUDA or not support bfloat16", +) +class TestAddMMBF16Op(OpTest): + def setUp(self): + self.op_type = "addmm" + self.python_api = paddle.addmm + self.init_dtype_type() + self.inputs = { + 'Input': np.random.random((100, 1)).astype(self.np_dtype), + 'X': np.random.random((100, 10)).astype(self.np_dtype), + 'Y': np.random.random((10, 20)).astype(self.np_dtype), + } + self.outputs = { + 'Out': self.inputs['Input'] + + np.dot(self.inputs['X'], self.inputs['Y']) + } + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['Input', 'X', 'Y'], 'Out') + + def test_check_grad_x(self): + self.check_grad_with_place(self.place, ['X'], 'Out', no_grad_set=None) + + def test_check_grad_y(self): + self.check_grad_with_place(self.place, ['Y'], 'Out', no_grad_set=None) + + def test_check_grad_input(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', no_grad_set=None + ) + + class TestAddMMOpError(unittest.TestCase): # test error def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py index d2af1933469aa..be37ff0157898 100644 --- a/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py +++ b/python/paddle/fluid/tests/unittests/test_broadcast_tensors_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -43,7 +43,7 @@ def find_output_shape(input_list): return list(reversed(output_shape)) -def make_inputs_outputs(input_shapes, dtype): +def make_inputs_outputs(input_shapes, dtype, is_bfloat16=False): """Automatically generate formatted inputs and outputs from input_shapes""" input_list = [ np.random.random(shape).astype(dtype) for shape in input_shapes @@ -53,6 +53,16 @@ def make_inputs_outputs(input_shapes, dtype): x + np.zeros(output_shape).astype(x.dtype) for x in input_list ] + if is_bfloat16: + input_list = [ + convert_float_to_uint16(input_list[i]) + for i in range(len(input_list)) + ] + output_list = [ + convert_float_to_uint16(output_list[i]) + for i in range(len(output_list)) + ] + output_formatted = { "Out": [(f"out{i}", output_list[i]) for i in range(len(output_list))] } @@ -63,24 +73,24 @@ def make_inputs_outputs(input_shapes, dtype): return input_formatted, output_formatted -def gen_rank_diff_test(dtype): +def gen_rank_diff_test(dtype, is_bfloat16=False): input_shapes = [(2, 60, 1), (6, 2, 1, 10)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_no_broadcast_test(dtype): +def gen_no_broadcast_test(dtype, is_bfloat16=False): input_shapes = [(12, 1, 10, 1), (12, 1, 10, 1)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_mixed_tensors_test(dtype): +def gen_mixed_tensors_test(dtype, is_bfloat16=False): input_shapes = [(2, 60, 1), (2, 2, 1, 30), (1, 2, 60, 1)] - return 
make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) -def gen_empty_tensors_test(dtype): +def gen_empty_tensors_test(dtype, is_bfloat16=False): input_shapes = [(0), (0), (0)] - return make_inputs_outputs(input_shapes, dtype) + return make_inputs_outputs(input_shapes, dtype, is_bfloat16) class TestCPUBroadcastTensorsOp(OpTest): @@ -125,7 +135,7 @@ def run_triple_in_test(self, test_func, args): def test_check_output(self): self.run_dual_test( self.check_output_with_place, - {"place": self.place, "atol": 1e-1}, + {"place": self.place}, ) def test_check_grad_normal(self): @@ -135,7 +145,6 @@ def test_check_grad_normal(self): "place": self.place, "inputs_to_check": ['x0', 'x1'], "output_names": ['out0', 'out1'], - "max_relative_error": 0.05, }, ) self.run_triple_in_test( @@ -144,7 +153,6 @@ def test_check_grad_normal(self): "place": self.place, "inputs_to_check": ['x0', 'x1', 'x2'], "output_names": ['out0', 'out1', "out2"], - "max_relative_error": 0.05, }, ) @@ -152,14 +160,77 @@ def test_check_grad_normal(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestCUDABroadcastTensorsOp(TestCPUBroadcastTensorsOp): +class TestBroadcastTensorsFP16Op(TestCPUBroadcastTensorsOp): def set_place(self): self.place = core.CUDAPlace(0) def set_dtypes(self): - self.dtypes = ['float64'] - if core.is_float16_supported(self.place): - self.dtypes.append('float16') + self.dtypes = ['float16'] + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestBroadcastTensorsBF16Op(OpTest): + def setUp(self): + self.op_type = "broadcast_tensors" + self.dtype = np.uint16 + self.np_dtype = "float32" + self.use_mkldnn = False + self.attrs = {'use_mkldnn': self.use_mkldnn} + self.test_gen_func_list = [ + gen_rank_diff_test, + gen_no_broadcast_test, + gen_mixed_tensors_test, + ] + self.python_api = paddle.broadcast_tensors + self.place = core.CUDAPlace(0) + + def run_dual_test(self, test_func, args): + for gen_func in self.test_gen_func_list: + self.inputs, self.outputs = gen_func(self.np_dtype, True) + if len(self.outputs["Out"]) < 3: + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) + + def run_triple_in_test(self, test_func, args): + self.inputs, self.outputs = self.test_gen_func_list[2]( + self.np_dtype, True + ) + self.python_out_sig = [ + f"out{i}" for i in range(len(self.outputs["Out"])) + ] + test_func(**args) + + def test_check_output(self): + self.run_dual_test( + self.check_output_with_place, + {"place": self.place}, + ) + + def test_check_grad_normal(self): + self.run_dual_test( + self.check_grad_with_place, + { + "place": self.place, + "inputs_to_check": ['x0', 'x1'], + "output_names": ['out0', 'out1'], + "check_dygraph": False, + }, + ) + self.run_triple_in_test( + self.check_grad_with_place, + { + "place": self.place, + "inputs_to_check": ['x0', 'x1', 'x2'], + "output_names": ['out0', 'out1', 'out2'], + "check_dygraph": False, + }, + ) class TestBroadcastTensorsAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 12c161ca9e56f..da78ee681aac0 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -15,7 +15,7 @@ import unittest import numpy 
as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from op import Operator import paddle @@ -102,6 +102,48 @@ def initTestCase(self): self.max_norm = 1.0 +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestClipByNormBF16Op(OpTest): + def setUp(self): + self.max_relative_error = 0.006 + self.python_api = clip.clip_by_norm + self.init_dtype() + self.initTestCase() + input = np.random.random(self.shape).astype(self.np_dtype) + input[np.abs(input) < self.max_relative_error] = 0.5 + self.op_type = "clip_by_norm" + self.inputs = { + 'X': input, + } + self.attrs = {} + self.attrs['max_norm'] = self.max_norm + norm = np.sqrt(np.sum(np.square(input))) + if norm > self.max_norm: + output = self.max_norm * input / norm + else: + output = input + self.outputs = {'Out': output} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def initTestCase(self): + self.shape = (100,) + self.max_norm = 1.0 + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + class TestClipByNormOpWithSelectedRows(unittest.TestCase): def check_with_place(self, place): self.config_test_case() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 0b843663827c1..d2b6545a81a4c 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -15,10 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest, paddle_static_guard +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + get_numeric_gradient, + paddle_static_guard, +) import paddle from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def conv3d_forward_naive( @@ -179,6 +185,77 @@ def init_kernel_type(self): globals()[cls_name] = TestCUDNNCase +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DCUDNNBF16(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Output'] + ) + + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place( + place, check_dygraph=(not self.use_mkldnn) + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Input') + + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + check_dygraph=(not self.use_mkldnn), + user_defined_grads=[numeric_grads], 
+ ) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_input_grads = self.get_numeric_grad(place, 'Input') + numeric_fliter_grads = self.get_numeric_grad(place, 'Filter') + + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + user_defined_grads=[numeric_input_grads, numeric_fliter_grads], + check_dygraph=(not self.use_mkldnn), + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DCUDNNBF16 + + def create_test_padding_SAME_class(parent): class TestPaddingSMAECase(parent): def init_paddings(self): @@ -323,19 +400,37 @@ def setUp(self): 'dilations': self.dilations, } - input = np.random.random(self.input_size).astype(self.dtype) - filter = np.random.random(self.filter_size).astype(self.dtype) + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) + output = conv3d_forward_naive( input, filter, self.groups, conv3d_param, - ).astype(self.dtype) + ) + + if self.is_bfloat16_op(): + output = convert_float_to_uint16(output) + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + self.inputs_fp32 = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } + else: + output = output.astype(self.dtype) + self.inputs = { + 'Input': OpTest.np_dtype_to_fluid_dtype(input), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), + } - self.inputs = { - 'Input': OpTest.np_dtype_to_fluid_dtype(input), - 'Filter': OpTest.np_dtype_to_fluid_dtype(filter), - } self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -358,8 +453,6 @@ def test_check_output(self): ) def test_check_grad(self): - if self.dtype == np.float16: - return place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -371,8 +464,7 @@ def test_check_grad(self): ) def test_check_grad_no_filter(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -385,8 +477,7 @@ def test_check_grad_no_filter(self): ) def test_check_grad_no_input(self): - if self.dtype == np.float16: - return + place = core.CUDAPlace(0) if self.has_cudnn() else core.CPUPlace() # TODO(wangzhongpu): support mkldnn op in dygraph mode self.check_grad_with_place( @@ -617,6 +708,14 @@ def init_kernel_type(self): self.dtype = np.float32 if core.is_compiled_with_rocm() else np.float64 +# ----------------Conv3DCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DOp) +create_test_cudnn_bf16_class(TestWithGroup1) +create_test_cudnn_bf16_class(TestWithGroup2) +create_test_cudnn_bf16_class(TestWith1x1) +create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1) + + # ---- test asymmetric padding ---- @@ -1114,4 +1213,5 @@ def run_8(): if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index 22e22d9b2f66a..c0814754cc231 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -19,11 +19,25 @@ import paddle paddle.enable_static() -from eager_op_test import OpTest +from eager_op_test import OpTest, copy_bits_from_float_to_uint16 from paddle.fluid import core +def convert_float_to_uint16(float_list, data_format="NCHW"): + if data_format == "NHWC": + float_list = np.transpose(float_list, [0, 4, 1, 2, 3]) + + new_output = [] + for x in np.nditer(float_list): + new_output.append(np.uint16(copy_bits_from_float_to_uint16(x))) + new_output = np.reshape(new_output, float_list.shape).view(np.uint16) + + if data_format == "NHWC": + new_output = np.transpose(new_output, [0, 2, 3, 4, 1]) + return new_output + + def conv3dtranspose_forward_naive(input_, filter_, attrs): padding_algorithm = attrs['padding_algorithm'] if padding_algorithm not in ["SAME", "VALID", "EXPLICIT"]: @@ -134,6 +148,86 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): return out +def create_test_cudnn_fp16_class(parent, grad_check=True): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestConv3DTransposeCUDNNFP16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=2e-2) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Input'], 'Output', no_grad_set={'Filter'} + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place) and grad_check: + self.check_grad_with_place( + place, ['Filter'], 'Output', no_grad_set={'Input'} + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNFP16OP") + TestConv3DTransposeCUDNNFP16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNFP16 + + +def create_test_cudnn_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestConv3DTransposeCUDNNBF16(parent): + def init_kernel_type(self): + self.use_cudnn = True + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + {'Input', 'Filter'}, + 'Output', + ) + + def test_check_grad_no_filter(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Input'], + 'Output', + no_grad_set={'Filter'}, + ) + + def test_check_grad_no_input(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['Filter'], + 'Output', + no_grad_set={'Input'}, + ) + + cls_name = "{}_{}".format(parent.__name__, "CUDNNBF16OP") + TestConv3DTransposeCUDNNBF16.__name__ = cls_name + globals()[cls_name] = TestConv3DTransposeCUDNNBF16 + + def conv3d_transpose_wrapper( x, weight, @@ -172,12 +266,16 @@ def setUp(self): self.pad = [0, 0, 0] self.padding_algorithm = "EXPLICIT" self.init_op_type() + self.init_kernel_type() self.init_test_case() - input_ = np.random.random(self.input_size).astype("float32") - filter_ = np.random.random(self.filter_size).astype("float32") + if self.is_bfloat16_op(): + input = np.random.random(self.input_size).astype(np.float32) + filter = 
np.random.random(self.filter_size).astype(np.float32) + else: + input = np.random.random(self.input_size).astype(self.dtype) + filter = np.random.random(self.filter_size).astype(self.dtype) - self.inputs = {'Input': input_, 'Filter': filter_} self.attrs = { 'strides': self.stride, 'paddings': self.pad, @@ -189,9 +287,21 @@ def setUp(self): } output = conv3dtranspose_forward_naive( - input_, filter_, self.attrs + input, filter, self.attrs ).astype("float32") + if self.is_bfloat16_op(): + self.inputs = { + 'Input': convert_float_to_uint16(input), + 'Filter': convert_float_to_uint16(filter), + } + else: + self.inputs = { + 'Input': input, + 'Filter': filter, + } + output = output.astype(self.dtype) + self.outputs = {'Output': output} def test_check_output(self): @@ -264,6 +374,9 @@ def init_op_type(self): self.op_type = "conv3d_transpose" self.python_api = conv3d_transpose_wrapper + def init_kernel_type(self): + self.dtype = np.float32 + class TestWithSymmetricPad(TestConv3DTransposeOp): def init_test_case(self): @@ -596,6 +709,30 @@ def init_op_type(self): self.python_api = conv3d_transpose_wrapper +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestConv3DTransposeOp) +create_test_cudnn_fp16_class(TestWithSymmetricPad) +create_test_cudnn_fp16_class(TestWithAsymmetricPad) +create_test_cudnn_fp16_class(TestWithSAMEPad) +create_test_cudnn_fp16_class(TestWithVALIDPad) +create_test_cudnn_fp16_class(TestWithStride) +create_test_cudnn_fp16_class(TestWithGroups) +create_test_cudnn_fp16_class(TestWithDilation) +create_test_cudnn_fp16_class(Test_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestConv3DTransposeOp) +create_test_cudnn_bf16_class(TestWithSymmetricPad) +create_test_cudnn_bf16_class(TestWithAsymmetricPad) +create_test_cudnn_bf16_class(TestWithSAMEPad) +create_test_cudnn_bf16_class(TestWithVALIDPad) +create_test_cudnn_bf16_class(TestWithStride) +create_test_cudnn_bf16_class(TestWithGroups) +create_test_cudnn_bf16_class(TestWithDilation) +create_test_cudnn_bf16_class(Test_NHWC) + + class TestConv3dTranspose(unittest.TestCase): def error_weight_input(self): array = np.array([1], dtype=np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py index b51cdd9b1087a..54d31a7bbd22b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from test_conv3d_transpose_op import TestConv3DTransposeOp +from test_conv3d_transpose_op import ( + TestConv3DTransposeOp, + create_test_cudnn_bf16_class, + create_test_cudnn_fp16_class, +) import paddle from paddle import fluid @@ -84,6 +88,22 @@ def init_test_case(self): self.data_format = 'NHWC' +# ----------------Conv3DTransposeCUDNN fp16---------------- +create_test_cudnn_fp16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_fp16_class(TestWithGroups_NHWC) +create_test_cudnn_fp16_class(TestWithStride_NHWC) +create_test_cudnn_fp16_class(TestWithDilation_NHWC) + + +# ----------------Conv3DTransposeCUDNN bf16---------------- +create_test_cudnn_bf16_class(TestWithSymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithAsymmetricPad_NHWC) +create_test_cudnn_bf16_class(TestWithGroups_NHWC) +create_test_cudnn_bf16_class(TestWithStride_NHWC) 
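# Editor's note (illustrative sketch, not part of the patch): every
# create_test_cudnn_fp16_class / create_test_cudnn_bf16_class helper in these
# test files follows the same pattern -- subclass the given parent test,
# override its dtype/kernel hooks, rename the class, and publish it through
# globals() so unittest discovery registers the low-precision variant next to
# the original. A minimal, framework-free version of that pattern (all names
# below are hypothetical, not Paddle APIs):

import unittest


class _BaseCase(unittest.TestCase):
    def init_kernel_type(self):
        self.dtype = "float32"

    def test_dtype_hook(self):
        self.init_kernel_type()
        self.assertIn(self.dtype, ("float32", "float16", "bfloat16"))


def create_low_precision_case(parent, dtype_name):
    class LowPrecisionCase(parent):
        def init_kernel_type(self):
            self.dtype = dtype_name  # override the parent's dtype hook

    cls_name = "{}_{}".format(parent.__name__, dtype_name.upper())
    LowPrecisionCase.__name__ = cls_name
    globals()[cls_name] = LowPrecisionCase  # visible to unittest discovery


# e.g. create_low_precision_case(_BaseCase, "bfloat16") defines and registers
# a new _BaseCase_BFLOAT16 case alongside the original one.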
+create_test_cudnn_bf16_class(TestWithDilation_NHWC) + + class TestConv3DTransposeAPI(unittest.TestCase): def test_case1(self): data1 = paddle.static.data( diff --git a/python/paddle/fluid/tests/unittests/test_fmax_op.py b/python/paddle/fluid/tests/unittests/test_fmax_op.py index 19417919df248..0271854aebb72 100644 --- a/python/paddle/fluid/tests/unittests/test_fmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmax_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -241,5 +241,34 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out') +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestFmaxBF16OP(OpTest): + def setUp(self): + self.op_type = "elementwise_fmax" + self.python_api = paddle.fmax + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32") + out = np.fmax(x, y) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fmin_op.py b/python/paddle/fluid/tests/unittests/test_fmin_op.py index 19e43fa3d1d6c..1956d5b4fc433 100644 --- a/python/paddle/fluid/tests/unittests/test_fmin_op.py +++ b/python/paddle/fluid/tests/unittests/test_fmin_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -243,6 +243,35 @@ def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out') +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and not support the bfloat16", +) +class TestFminBF16OP(OpTest): + def setUp(self): + self.op_type = "elementwise_fmin" + self.python_api = paddle.fmin + self.dtype = np.uint16 + x = np.random.uniform(1, 1, [13, 17]).astype("float32") + sgn = np.random.choice([-1, 1], [13, 17]).astype("float32") + y = x + sgn * np.random.uniform(1, 1, [13, 17]).astype("float32") + out = np.fmin(x, y) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_index_add_op.py b/python/paddle/fluid/tests/unittests/test_index_add_op.py index 6b96493cfcf78..0e9b15f45438f 100644 --- a/python/paddle/fluid/tests/unittests/test_index_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_add_op.py @@ -15,10 +15,10 @@ import unittest import numpy as np -from eager_op_test import 
OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle -from paddle.fluid import Program +from paddle.fluid import Program, core def compute_index_add_ref( @@ -99,6 +99,69 @@ def test_check_grad_normal(self): self.check_grad(['X', 'AddValue'], 'Out') +class TestIndexAddFP16Op(TestIndexAddOp): + def init_dtype_type(self): + self.axis = 0 + self.x_type = np.float16 + self.index_type = np.int64 + self.x_shape = (101, 3) + self.index_size = 3 + self.add_value_shape = (3, 3) + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestIndexAddBF16Op(OpTest): + def setUp(self): + self.python_api = raw_index_add + self.op_type = "index_add" + self.init_dtype_type() + index_np = np.random.randint( + low=0, high=self.x_shape[self.axis], size=self.index_size + ) + x_np = np.random.random(self.x_shape).astype(self.x_type) + add_value_np = np.random.random(self.add_value_shape).astype( + self.x_type + ) + + self.inputs = { + 'X': convert_float_to_uint16(x_np), + 'Index': index_np, + 'AddValue': convert_float_to_uint16(add_value_np), + } + self.attrs = {'axis': self.axis} + out = compute_index_add_ref( + self.axis, + self.x_shape, + x_np, + self.add_value_shape, + add_value_np, + self.index_size, + index_np, + ) + self.outputs = {'Out': convert_float_to_uint16(out)} + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.axis = 0 + self.x_type = np.float32 + self.index_type = np.int64 + self.x_shape = (101, 3) + self.index_size = 3 + self.add_value_shape = (3, 3) + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'AddValue'], 'Out') + + class TestIndexAddAPI(unittest.TestCase): def setUp(self): self.setType() diff --git a/python/paddle/fluid/tests/unittests/test_index_sample_op.py b/python/paddle/fluid/tests/unittests/test_index_sample_op.py index 33491720af9c5..d9551e0bea8d8 100755 --- a/python/paddle/fluid/tests/unittests/test_index_sample_op.py +++ b/python/paddle/fluid/tests/unittests/test_index_sample_op.py @@ -15,10 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid +from paddle.fluid import core class TestIndexSampleOp(OpTest): @@ -121,6 +122,49 @@ def config(self): self.index_type = "int64" +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestIndexSampleBF16Op(OpTest): + def setUp(self): + self.op_type = "index_sample" + self.python_api = paddle.index_sample + self.config() + xnp = np.random.random(self.x_shape).astype(self.x_type) + indexnp = np.random.randint( + low=0, high=self.x_shape[1], size=self.index_shape + ).astype(self.index_type) + self.inputs = {'X': xnp, 'Index': indexnp} + index_array = [] + for i in range(self.index_shape[0]): + for j in indexnp[i]: + index_array.append(xnp[i, j]) + index_array = np.array(index_array).astype(self.x_type) + out = np.reshape(index_array, self.index_shape) + self.outputs = {'Out': out} + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def 
test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (10, 20) + self.x_type = "float32" + self.dtype = np.uint16 + self.index_shape = (10, 10) + self.index_type = "int32" + + class TestIndexSampleShape(unittest.TestCase): def test_shape(self): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_logspace.py b/python/paddle/fluid/tests/unittests/test_logspace.py index 19461fc938f20..2982a12968c95 100644 --- a/python/paddle/fluid/tests/unittests/test_logspace.py +++ b/python/paddle/fluid/tests/unittests/test_logspace.py @@ -15,9 +15,10 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle +from paddle.fluid import core class TestLogspaceOpCommonCase(OpTest): @@ -41,6 +42,54 @@ def test_check_output(self): self.check_output() +class TestLogspaceFP16Op(TestLogspaceOpCommonCase): + def init_data(self): + self.dtype = np.float16 + self.inputs = { + 'Start': np.array([0]).astype(self.dtype), + 'Stop': np.array([10]).astype(self.dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(self.dtype), + } + self.attrs = {'dtype': int(paddle.float16)} + self.outputs = {'Out': np.power(2, np.arange(0, 11)).astype(self.dtype)} + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestLogspaceBF16Op(OpTest): + def setUp(self): + self.op_type = "logspace" + self.python_api = paddle.logspace + self.init_data() + + def init_data(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + self.inputs = { + 'Start': np.array([0]).astype(self.np_dtype), + 'Stop': np.array([10]).astype(self.np_dtype), + 'Num': np.array([11]).astype('int32'), + 'Base': np.array([2]).astype(self.np_dtype), + } + self.attrs = {'dtype': int(paddle.bfloat16)} + self.outputs = { + 'Out': np.power(2, np.arange(0, 11)).astype(self.np_dtype) + } + + self.inputs["Start"] = convert_float_to_uint16(self.inputs["Start"]) + self.inputs["Stop"] = convert_float_to_uint16(self.inputs["Stop"]) + self.inputs["Base"] = convert_float_to_uint16(self.inputs["Base"]) + self.outputs["Out"] = convert_float_to_uint16(self.outputs["Out"]) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + class TestLogspaceOpReverseCase(TestLogspaceOpCommonCase): def init_data(self): dtype = 'float32' diff --git a/python/paddle/fluid/tests/unittests/test_mul_op.py b/python/paddle/fluid/tests/unittests/test_mul_op.py index 1e2b909d90dab..9ce95d354d7af 100644 --- a/python/paddle/fluid/tests/unittests/test_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_mul_op.py @@ -20,7 +20,7 @@ from paddle.fluid import core sys.path.append("..") -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 class TestMulOp(OpTest): @@ -114,14 +114,14 @@ def test_check_grad_ignore_y(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestFP16MulOp1(TestMulOp): +class TestMulFP16Op1(TestMulOp): def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=1e-1, 
check_dygraph=False) + self.check_output_with_place(place, check_dygraph=False) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -130,7 +130,6 @@ def test_check_grad_normal(self): place, ['X', 'Y'], 'Out', - max_relative_error=0.5, check_dygraph=False, ) @@ -141,7 +140,6 @@ def test_check_grad_ingore_x(self): place, ['Y'], 'Out', - max_relative_error=0.5, no_grad_set=set("X"), check_dygraph=False, ) @@ -153,7 +151,6 @@ def test_check_grad_ingore_y(self): place, ['X'], 'Out', - max_relative_error=0.5, no_grad_set=set('Y'), check_dygraph=False, ) @@ -162,14 +159,14 @@ def test_check_grad_ingore_y(self): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestFP16MulOp2(TestMulOp2): +class TestMulFP16Op2(TestMulOp2): def init_dtype_type(self): self.dtype = np.float16 def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=2e-1, check_dygraph=False) + self.check_output_with_place(place, check_dygraph=False) def test_check_grad_normal(self): place = core.CUDAPlace(0) @@ -178,7 +175,6 @@ def test_check_grad_normal(self): place, ['X', 'Y'], 'Out', - max_relative_error=0.9, check_dygraph=False, ) @@ -189,7 +185,6 @@ def test_check_grad_ingore_x(self): place, ['Y'], 'Out', - max_relative_error=0.5, no_grad_set=set("X"), check_dygraph=False, ) @@ -201,11 +196,120 @@ def test_check_grad_ingore_y(self): place, ['X'], 'Out', - max_relative_error=0.9, no_grad_set=set('Y'), check_dygraph=False, ) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMulBF16Op1(OpTest): + def setUp(self): + self.op_type = "mul" + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((20, 5)).astype(self.np_dtype), + 'Y': np.random.random((5, 21)).astype(self.np_dtype), + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_dtype_type(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], 'Out', check_dygraph=False + ) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + no_grad_set=set("X"), + check_dygraph=False, + ) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + no_grad_set=set('Y'), + check_dygraph=False, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMulBF16Op2(TestMulBF16Op1): + def setUp(self): + self.op_type = "mul" + self.init_dtype_type() + self.inputs = { + 'X': np.random.random((3, 4, 2, 9)).astype(self.np_dtype), + 'Y': np.random.random((3, 6, 1, 2, 3)).astype(self.np_dtype), + } + self.attrs = { + 'x_num_col_dims': 2, + 'y_num_col_dims': 2, + } + result = np.dot( + self.inputs['X'].reshape(3 * 4, 2 * 9), + self.inputs['Y'].reshape(3 * 6, 1 * 2 * 3), + ) + result = result.reshape(3, 4, 1, 2, 3) + self.outputs = {'Out': result} + + 
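# Editor's note (illustrative, not part of the patch): the reference result
# above is computed in float32 and only afterwards packed into uint16, because
# bfloat16 is stored as the upper 16 bits of an IEEE float32. A simplified
# stand-in for convert_float_to_uint16 and its inverse (truncating here; the
# real eager_op_test helper may round instead) could look like:
#
#     import numpy as np
#
#     def to_bf16_bits(x):                         # float32 -> uint16 storage
#         x = np.ascontiguousarray(x, dtype=np.float32)
#         return (x.view(np.uint32) >> 16).astype(np.uint16)
#
#     def from_bf16_bits(bits):                    # uint16 storage -> float32
#         return (bits.astype(np.uint32) << 16).view(np.float32)
#
#     x = np.random.random((20, 5)).astype(np.float32)
#     # bf16 keeps only ~3 significant decimal digits, hence the loose atol
#     assert np.allclose(from_bf16_bits(to_bf16_bits(x)), x, atol=1e-2)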
self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Y'] = convert_float_to_uint16(self.inputs['Y']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, + ['X', 'Y'], + 'Out', + numeric_grad_delta=0.02, + check_dygraph=False, + ) + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, + ['Y'], + 'Out', + numeric_grad_delta=0.02, + no_grad_set=set("X"), + check_dygraph=False, + ) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, + ['X'], + 'Out', + numeric_grad_delta=0.02, + no_grad_set=set('Y'), + check_dygraph=False, + ) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index a5555657c7f15..dc6f5c4810cf9 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -15,10 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 from numpy.linalg import multi_dot import paddle +from paddle.fluid import core paddle.enable_static() @@ -49,6 +50,53 @@ def test_check_grad(self): self.check_grad(['x1'], 'Out') +class TestMultiDotFP16Op(TestMultiDotOp): + def get_dtype(self): + return "float16" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestMultiDotBF16Op(OpTest): + def setUp(self): + self.op_type = "multi_dot" + self.python_api = paddle.linalg.multi_dot + self.dtype = self.get_dtype() + self.get_inputs_and_outputs() + self.place = core.CUDAPlace(0) + + def get_dtype(self): + self.np_dtype = "float32" + return np.uint16 + + def get_inputs_and_outputs(self): + self.A = np.random.random((2, 8)).astype(self.np_dtype) + self.B = np.random.random((8, 4)).astype(self.np_dtype) + self.inputs = { + 'X': [ + ('x0', convert_float_to_uint16(self.A)), + ('x1', convert_float_to_uint16(self.B)), + ] + } + self.outputs = { + 'Out': convert_float_to_uint16(multi_dot([self.A, self.B])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['x0'], 'Out', numeric_grad_delta=0.01 + ) + self.check_grad_with_place( + self.place, ['x1'], 'Out', numeric_grad_delta=0.01 + ) + + # (A*B)*C class TestMultiDotOp3Mat(TestMultiDotOp): def get_inputs_and_outputs(self): diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index d8d61f4fb2904..16d1f356537bc 100644 --- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -15,9 +15,16 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, + get_numeric_gradient, +) import paddle +from paddle.fluid import core +from paddle.fluid.tests.unittests.testsuite import create_op def adaptive_start_index(index, input_size, output_size): @@ -149,9 +156,18 @@ def setUp(self): self.init_test_case() self.init_global() self.init_adaptive() + self.init_dtype() + + if self.is_bfloat16_op(): + input = 
np.random.random(self.shape).astype(np.float32) + input = convert_uint16_to_float( + convert_float_to_uint16(np.round(input * 100.0, 2)) + ) + + else: + input = np.random.random(self.shape).astype(self.dtype) + input = np.round(input * 100.0, 2) - input = np.random.random(self.shape).astype("float64") - input = np.round(input * 100.0, 2) output, mask = self.pool_forward_naive( input, self.ksize, @@ -160,8 +176,11 @@ def setUp(self): self.global_pool, self.adaptive, ) - output = output.astype("float64") mask = mask.astype("int32") + if self.is_bfloat16_op(): + output = output.astype(np.float32) + else: + output = output.astype(self.dtype) self.attrs = { 'strides': self.strides, @@ -171,8 +190,20 @@ def setUp(self): 'adaptive': self.adaptive, } - self.inputs = {'X': input} - self.outputs = {'Out': output, "Mask": mask} + if self.is_bfloat16_op(): + self.inputs = {'X': convert_float_to_uint16(input)} + self.outputs = { + 'Out': convert_float_to_uint16(output), + "Mask": mask, + } + self.inputs_fp32 = {'X': input} + + else: + self.inputs = {'X': input} + self.outputs = {'Out': output, "Mask": mask} + + def init_dtype(self): + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -220,9 +251,90 @@ def init_global(self): self.global_pool = False -# ----------------max_pool2d_with_index---------------- +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True +# ----------------max_pool3d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool3dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool3dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dFP16 + + +create_test_fp16_class(TestMaxPoolWithIndex_Op) +create_test_fp16_class(TestCase1) +create_test_fp16_class(TestCase2) +create_test_fp16_class(TestCase3) +create_test_fp16_class(TestCastAdaptive3d) + + +# ----------------max_pool3d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool3dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool3dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool3dBF16 + + 
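# Editor's note (illustrative sketch, not part of the patch): the bf16 pooling
# cases above feed user_defined_grads computed numerically from the float32
# copy kept in self.inputs_fp32, presumably because finite differences on
# uint16-packed bfloat16 data would be too coarse to check against. A minimal
# standalone version of that central-difference idea (plain NumPy; the
# framework-aware equivalent is eager_op_test.get_numeric_gradient):

import numpy as np


def numeric_grad(f, x, eps=1e-3):
    """Central-difference gradient of a scalar-valued f with respect to x."""
    x = np.array(x, dtype=np.float32)        # work on a float32 copy
    grad = np.zeros_like(x)
    flat_x, flat_g = x.reshape(-1), grad.reshape(-1)
    for i in range(flat_x.size):
        orig = flat_x[i]
        flat_x[i] = orig + eps
        f_plus = f(x)
        flat_x[i] = orig - eps
        f_minus = f(x)
        flat_x[i] = orig                     # restore before moving on
        flat_g[i] = (f_plus - f_minus) / (2.0 * eps)
    return grad


# e.g. numeric_grad(lambda v: float((v ** 2).sum()), np.ones((2, 3))) is close
# to 2 * x; the bf16 tests pass such fp32 gradients in as user_defined_grads.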
+create_test_bf16_class(TestMaxPoolWithIndex_Op) +create_test_bf16_class(TestCase1) +create_test_bf16_class(TestCase2) +create_test_bf16_class(TestCase3) +create_test_bf16_class(TestCastAdaptive3d) + + +# ----------------max_pool2d_with_index---------------- def max_pool2d_with_index_wapper( x, kernel_size=[], @@ -279,9 +391,82 @@ def init_adaptive(self): self.adaptive = True -class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): - def init_adaptive(self): - self.adaptive = True +# ----------------max_pool2d_with_index_fp16---------------- +def create_test_fp16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + ) + class TestMaxPool2dFP16(parent): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place(place, {'X'}, ['Out']) + + cls_name = "{}_{}".format(parent.__name__, "FP16OP") + TestMaxPool2dFP16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dFP16 + + +create_test_fp16_class(TestCase4) +create_test_fp16_class(TestCase5) +create_test_fp16_class(TestCase6) +create_test_fp16_class(TestCase7) +create_test_fp16_class(TestCastAdaptive2d) + + +# ----------------max_pool2d_with_index_bf16---------------- +def create_test_bf16_class(parent): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", + ) + class TestMaxPool2dBF16(parent): + def init_dtype(self): + self.dtype = np.uint16 + + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op( + scope, self.op_type, self.inputs, self.outputs, self.attrs + ) + return get_numeric_gradient( + place, scope, op, self.inputs_fp32, check_name, ['Out'] + ) + + def test_check_output(self): + place = core.CUDAPlace(0) + if core.is_bfloat16_supported(place): + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + if core.is_bfloat16_supported(place): + self.check_grad_with_place( + place, {'X'}, ['Out'], user_defined_grads=[numeric_grads] + ) + + cls_name = "{}_{}".format(parent.__name__, "BF16OP") + TestMaxPool2dBF16.__name__ = cls_name + globals()[cls_name] = TestMaxPool2dBF16 + + +create_test_bf16_class(TestCase4) +create_test_bf16_class(TestCase5) +create_test_bf16_class(TestCase6) +create_test_bf16_class(TestCase7) +create_test_bf16_class(TestCastAdaptive2d) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_prelu_op.py b/python/paddle/fluid/tests/unittests/test_prelu_op.py index a9d00c42219af..8588ed498d703 100644 --- a/python/paddle/fluid/tests/unittests/test_prelu_op.py +++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest, skip_check_grad_ci +from eager_op_test import OpTest, convert_float_to_uint16, skip_check_grad_ci import paddle import paddle.nn.functional as F @@ -174,7 +174,11 @@ def setUp(self): self.op_type = "prelu" self.python_api = prelu_api_wrapper - x_np = np.random.uniform(-1, 1, self.x_shape).astype(self.dtype) + if self.dtype == np.uint16: + as_type = self.np_dtype + else: + as_type = self.dtype 
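# Editor's note (illustrative, not part of the patch): np.uint16 is only the
# storage dtype for bfloat16 in these tests, so random samples cannot be drawn
# in that dtype directly -- they would collapse to integers. The data is
# therefore generated in the float32 "compute" dtype (self.np_dtype) and packed
# to uint16 later by the bf16 subclass. For instance:
#
#     import numpy as np
#
#     np.random.uniform(0.1, 1.0, 3).astype(np.uint16)   # collapses to [0, 0, 0]
#     np.random.uniform(0.1, 1.0, 3).astype(np.float32)  # keeps the random values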
+ x_np = np.random.uniform(-1, 1, self.x_shape).astype(as_type) # Since zero point in prelu is not differentiable, avoid randomize # zero. x_np[np.abs(x_np) < 0.005] = 0.02 @@ -190,7 +194,7 @@ def setUp(self): alpha_np = np.random.uniform(-1, -0.5, [1, 1, 1, self.x_shape[-1]]) else: alpha_np = np.random.uniform(-1, -0.5, [1] + self.x_shape[1:]) - alpha_np = alpha_np.astype(self.dtype) + alpha_np = alpha_np.astype(as_type) self.inputs = {'X': x_np, 'Alpha': alpha_np} @@ -393,18 +397,48 @@ def test_check_output(self): def test_check_grad(self): place = core.CUDAPlace(0) if core.is_float16_supported(place) and check_grad: - self.check_grad_with_place( - place, - ['X', 'Alpha'], - 'Out', - max_relative_error=max_relative_error, - ) + # Use the default max_relative_error, not use max_relative_error + self.check_grad_with_place(place, ['X', 'Alpha'], 'Out') cls_name = "{}_{}".format(parent.__name__, "Fp16Op") TestPReluFp16Case.__name__ = cls_name globals()[cls_name] = TestPReluFp16Case +def create_test_bf16_class( + parent, check_grad=True, atol=1e-3, max_relative_error=0.05 +): + @unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", + ) + class TestPReluBF16Op(parent): + def setUp(self): + super().setUp() + self.inputs['X'] = convert_float_to_uint16(self.inputs['X']) + self.inputs['Alpha'] = convert_float_to_uint16(self.inputs['Alpha']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + + def init_dtype(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad(self): + place = core.CUDAPlace(0) + if check_grad: + # Use the default max_relative_error, not use max_relative_error + self.check_grad_with_place(place, ['X', 'Alpha'], 'Out') + + cls_name = "{}_{}".format(parent.__name__, "BF16Op") + TestPReluBF16Op.__name__ = cls_name + globals()[cls_name] = TestPReluBF16Op + + create_test_fp16_class(TestModeElt) create_test_fp16_class(TestModeAllRank3) create_test_fp16_class(TestModeAllRank6) @@ -420,6 +454,21 @@ def test_check_grad(self): create_test_fp16_class(TestModeElementRank3NHWC) create_test_fp16_class(TestModeElementRank6NHWC) +create_test_bf16_class(TestModeElt) +create_test_bf16_class(TestModeAllRank3) +create_test_bf16_class(TestModeAllRank6) +create_test_bf16_class(TestModeChannelRank3) +create_test_bf16_class(TestModeChannelRank6) +create_test_bf16_class(TestModeElementRank3) +create_test_bf16_class(TestModeElementRank6) +create_test_bf16_class(TestModeEltNHWC) +create_test_bf16_class(TestModeAllRank3NHWC) +create_test_bf16_class(TestModeAllRank6NHWC) +create_test_bf16_class(TestModeChannelRank3NHWC) +create_test_bf16_class(TestModeChannelRank6NHWC) +create_test_bf16_class(TestModeElementRank3NHWC) +create_test_bf16_class(TestModeElementRank6NHWC) + def prelu_t(x, mode, param_attr=None, name=None, data_format='NCHW'): helper = fluid.layer_helper.LayerHelper('prelu', **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py index 75be4531820fb..a2085cc416a19 100644 --- a/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_put_along_axis_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, 
convert_float_to_uint16 import paddle from paddle.framework import core @@ -28,19 +28,18 @@ class TestPutAlongAxisOp(OpTest): def setUp(self): self.init_data() self.reduce_op = "assign" - self.dtype = 'float64' self.op_type = "put_along_axis" self.python_api = paddle.tensor.put_along_axis self.xnp = np.random.random(self.x_shape).astype(self.x_type) - # numpy put_along_axis is an inplace opearion. + # numpy put_along_axis is an inplace operation. self.xnp_result = copy.deepcopy(self.xnp) np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) self.target = self.xnp_result broadcast_shape_list = list(self.x_shape) broadcast_shape_list[self.axis] = 1 - self.braodcast_shape = tuple(broadcast_shape_list) - self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) - self.value_broadcast = np.broadcast_to(self.value, self.braodcast_shape) + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.broadcast_shape) self.inputs = { 'Input': self.xnp, 'Index': self.index_broadcast, @@ -56,6 +55,7 @@ def test_check_grad(self): self.check_grad(["Input", "Value"], "Result") def init_data(self): + self.dtype = 'float64' self.x_type = "float64" self.x_shape = (10, 10, 10) self.value_type = "float64" @@ -66,6 +66,71 @@ def init_data(self): self.axis_type = "int64" +class TestPutAlongAxisFP16Op(TestPutAlongAxisOp): + def init_data(self): + self.dtype = np.float16 + self.x_type = "float16" + self.x_shape = (10, 10, 10) + self.value_type = "float16" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestPutAlongAxisBF16Op(OpTest): + def setUp(self): + self.init_data() + self.reduce_op = "assign" + self.op_type = "put_along_axis" + self.python_api = paddle.tensor.put_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + # numpy put_along_axis is an inplace operation. 
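# Editor's note (illustrative, not part of the patch): np.put_along_axis really
# does write into its first argument, so the deep copy below is what keeps the
# untouched self.xnp available as the op input while the copy becomes the
# expected result. For example:
#
#     import copy
#     import numpy as np
#
#     arr = np.zeros((2, 3), dtype=np.float32)
#     ref = copy.deepcopy(arr)                  # expected result, safe to mutate
#     np.put_along_axis(ref, np.array([[1], [2]]), 9.0, axis=1)
#     assert arr.sum() == 0.0 and ref[0, 1] == 9.0 and ref[1, 2] == 9.0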
+ self.xnp_result = copy.deepcopy(self.xnp) + np.put_along_axis(self.xnp_result, self.index, self.value, self.axis) + self.target = self.xnp_result + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.value_broadcast = np.broadcast_to(self.value, self.broadcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + 'Value': self.value_broadcast, + } + self.attrs = {'Axis': self.axis, 'Reduce': self.reduce_op} + self.outputs = {'Result': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.inputs['Value'] = convert_float_to_uint16(self.inputs['Value']) + self.outputs['Result'] = convert_float_to_uint16(self.outputs['Result']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["Input", "Value"], "Result") + + def init_data(self): + self.dtype = np.uint16 + self.x_type = "float32" + self.x_shape = (10, 10, 10) + self.value_type = "float32" + self.value = np.array([99]).astype(self.value_type) + self.index_type = "int32" + self.index = np.array([[[0]]]).astype(self.index_type) + self.axis = 1 + self.axis_type = "int64" + + class TestPutAlongAxisAPI(unittest.TestCase): def setUp(self): np.random.seed(0) diff --git a/python/paddle/fluid/tests/unittests/test_randperm_op.py b/python/paddle/fluid/tests/unittests/test_randperm_op.py index eaecf087f9f04..5df2873b93ec4 100644 --- a/python/paddle/fluid/tests/unittests/test_randperm_op.py +++ b/python/paddle/fluid/tests/unittests/test_randperm_op.py @@ -15,7 +15,11 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import ( + OpTest, + convert_float_to_uint16, + convert_uint16_to_float, +) import paddle from paddle.fluid import core @@ -40,12 +44,21 @@ def error_msg(data_np): def convert_dtype(dtype_str): - dtype_str_list = ["int32", "int64", "float32", "float64"] + dtype_str_list = [ + "int32", + "int64", + "float16", + "float32", + "float64", + "uint16", + ] dtype_num_list = [ core.VarDesc.VarType.INT32, core.VarDesc.VarType.INT64, + core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, core.VarDesc.VarType.FP64, + core.VarDesc.VarType.BF16, ] assert dtype_str in dtype_str_list, ( dtype_str + " should in " + str(dtype_str_list) @@ -62,9 +75,9 @@ def setUp(self): self.n = 200 self.dtype = "int64" + self.init_attrs() self.inputs = {} self.outputs = {"Out": np.zeros(self.n).astype(self.dtype)} - self.init_attrs() self.attrs = { "n": self.n, "dtype": convert_dtype(self.dtype), @@ -103,6 +116,47 @@ def init_attrs(self): self.dtype = "float64" +class TestRandpermFP16Op(TestRandpermOp): + def init_attrs(self): + self.dtype = "float16" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestRandpermBF16Op(OpTest): + def setUp(self): + self.op_type = "randperm" + self.python_api = paddle.randperm + self.n = 200 + + self.init_attrs() + self.inputs = {} + self.outputs = {"Out": np.zeros(self.n).astype(self.np_dtype)} + self.attrs = { + "n": self.n, + "dtype": convert_dtype(self.dtype), + } + + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def init_attrs(self): + self.dtype = 
"uint16" + self.np_dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place_customized(self.verify_output, self.place) + + def verify_output(self, outs): + out_np = convert_uint16_to_float(np.array(outs[0])) + self.assertTrue( + check_randperm_out(self.n, out_np), msg=error_msg(out_np) + ) + + class TestRandpermOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 050879369244d..631b760a7b8da 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -418,6 +418,51 @@ def test_check_output(self): self.check_output() +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework." +) +class TestMinFP16Op(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.python_api = paddle.min + self.public_python_api = paddle.min + self.init_dtype() + if self.dtype == np.uint16: + x = np.random.random((5, 6, 10)).astype(np.float32) + self.inputs = {'X': convert_float_to_uint16(x)} + else: + x = np.random.random((5, 6, 10)).astype(self.dtype) + self.inputs = {'X': x} + self.attrs = {'dim': [2], 'keep_dim': True} + out = x.min(axis=tuple(self.attrs['dim']), keepdims=True) + if self.dtype == np.uint16: + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.outputs = {'Out': out} + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support the bfloat16", +) +class TestMinBF16Op(TestMinFP16Op): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def raw_reduce_prod(x, dim=[0], keep_dim=False): return paddle.prod(x, dim, keep_dim) diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 3149ca82b3f62..29446bafbbfeb 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -65,7 +65,7 @@ def test_check_grad(self): # test with attr(num) -class TestSplitOp_2(OpTest): +class TestSplitWithNumOp(OpTest): def setUp(self): self.python_api = paddle.split self.public_python_api = paddle.split @@ -74,18 +74,32 @@ def setUp(self): self.prim_op_type = "prim" self.dtype = self.get_dtype() self.init_data() - self.inputs = {'X': self.x} self.attrs = { 'axis': self.axis, 'sections': self.sections, 'num': self.num, } - - out = np.split(self.x, self.indices_or_sections, self.axis) - self.outputs = {'Out': [('out%d' % i, out[i]) for i in range(len(out))]} + if self.dtype == np.uint16: + self.inputs = {'X': convert_float_to_uint16(self.x)} + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = { + 'Out': [ + ('out%d' % i, convert_float_to_uint16(out[i])) + for i in range(len(out)) + ] + } + else: + self.inputs = {'X': self.x} + out = np.split(self.x, self.indices_or_sections, self.axis) + self.outputs = { + 'Out': [('out%d' % i, out[i]) for i in range(len(out))] + } def init_data(self): - self.x = np.random.random((4, 5, 
6)).astype(self.dtype) + if self.dtype == np.uint16: + self.x = np.random.random((4, 5, 6)).astype(np.float32) + else: + self.x = np.random.random((4, 5, 6)).astype(self.dtype) self.axis = 2 self.sections = [] self.num = 3 @@ -240,28 +254,28 @@ def create_test_fp16(parent): @unittest.skipIf( not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) - class TestSplitFp16(parent): + class TestSplitFP16Op(parent): def get_dtype(self): return np.float16 - def test_check_grad(self): - pass - - cls_name = "{}_{}".format(parent.__name__, "Fp16") - TestSplitFp16.__name__ = cls_name - globals()[cls_name] = TestSplitFp16 + cls_name = "{}_{}".format(parent.__name__, "FP16Op") + TestSplitFP16Op.__name__ = cls_name + globals()[cls_name] = TestSplitFP16Op create_test_fp16(TestSplitOp) +create_test_fp16(TestSplitWithNumOp) # ----------------Split Bf16---------------- def create_test_bf16(parent): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", ) - class TestSplitBf16(parent): + class TestSplitBF16Op(parent): def get_dtype(self): return np.uint16 @@ -270,14 +284,16 @@ def test_check_output(self): self.check_output_with_place(place) def test_check_grad(self): - pass + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'out2') - cls_name = "{}_{}".format(parent.__name__, "Bf16") - TestSplitBf16.__name__ = cls_name - globals()[cls_name] = TestSplitBf16 + cls_name = "{}_{}".format(parent.__name__, "BF16Op") + TestSplitBF16Op.__name__ = cls_name + globals()[cls_name] = TestSplitBF16Op create_test_bf16(TestSplitOp) +create_test_bf16(TestSplitWithNumOp) class TestSplitAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py index b5a9c2169ff96..4180a7c4dc758 100644 --- a/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py +++ b/python/paddle/fluid/tests/unittests/test_take_along_axis_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle.framework import core @@ -32,8 +32,8 @@ def setUp(self): self.target = np.take_along_axis(self.xnp, self.index, self.axis) broadcast_shape_list = list(self.x_shape) broadcast_shape_list[self.axis] = 1 - self.braodcast_shape = tuple(broadcast_shape_list) - self.index_broadcast = np.broadcast_to(self.index, self.braodcast_shape) + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) self.inputs = { 'Input': self.xnp, 'Index': self.index_broadcast, @@ -58,6 +58,64 @@ def init_data(self): self.axis_type = "int64" +class TestTakeAlongAxisFP16Op(TestTakeAlongAxisOp): + def init_data(self): + self.dtype = np.float16 + self.x_type = "float16" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[1]], [[1]], [[2]], [[4]], [[3]]]).astype( + self.index_type + ) + self.axis = 2 + self.axis_type = "int64" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestTakeAlongAxisBF16Op(OpTest): + def setUp(self): + self.init_data() + self.op_type = "take_along_axis" + self.python_api = 
paddle.tensor.take_along_axis + self.xnp = np.random.random(self.x_shape).astype(self.x_type) + self.target = np.take_along_axis(self.xnp, self.index, self.axis) + broadcast_shape_list = list(self.x_shape) + broadcast_shape_list[self.axis] = 1 + self.broadcast_shape = tuple(broadcast_shape_list) + self.index_broadcast = np.broadcast_to(self.index, self.broadcast_shape) + self.inputs = { + 'Input': self.xnp, + 'Index': self.index_broadcast, + } + self.attrs = {'Axis': self.axis} + self.outputs = {'Result': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.outputs['Result'] = convert_float_to_uint16(self.outputs['Result']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['Input'], 'Result') + + def init_data(self): + self.dtype = np.uint16 + self.x_type = "float32" + self.x_shape = (5, 5, 5) + self.index_type = "int32" + self.index = np.array([[[1]], [[1]], [[2]], [[4]], [[3]]]).astype( + self.index_type + ) + self.axis = 2 + self.axis_type = "int64" + + class TestCase1(TestTakeAlongAxisOp): def init_data(self): self.x_type = "float64" diff --git a/python/paddle/fluid/tests/unittests/test_trace_op.py b/python/paddle/fluid/tests/unittests/test_trace_op.py index e910658721628..f13dd1701affe 100644 --- a/python/paddle/fluid/tests/unittests/test_trace_op.py +++ b/python/paddle/fluid/tests/unittests/test_trace_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle from paddle import fluid, tensor @@ -68,6 +68,82 @@ def init_config(self): ) +class TestTraceFP16Op1(TestTraceOp): + def init_config(self): + self.dtype = np.float16 + self.case = np.random.randn(20, 6).astype(self.dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1} + self.target = np.trace(self.inputs['Input']) + + +class TestTraceFP16Op2(TestTraceOp): + def init_config(self): + self.dtype = np.float16 + self.case = np.random.randn(2, 20, 2, 3).astype(self.dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1} + self.target = np.trace( + self.inputs['Input'], + offset=self.attrs['offset'], + axis1=self.attrs['axis1'], + axis2=self.attrs['axis2'], + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA or not support bfloat16", +) +class TestTraceBF16Op1(OpTest): + def setUp(self): + self.op_type = "trace" + self.python_api = paddle.trace + self.init_config() + self.outputs = {'Out': self.target} + + self.inputs['Input'] = convert_float_to_uint16(self.inputs['Input']) + self.outputs['Out'] = convert_float_to_uint16(self.outputs['Out']) + self.place = core.CUDAPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', numeric_grad_delta=0.02 + ) + + def init_config(self): + self.dtype = np.uint16 + self.np_dtype = np.float32 + self.case = np.random.randn(20, 6).astype(self.np_dtype) + self.inputs = {'Input': self.case} + self.attrs = {'offset': 0, 'axis1': 0, 'axis2': 1} + self.target = np.trace(self.inputs['Input']) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with 
CUDA or not support bfloat16",
+)
+class TestTraceBF16Op2(TestTraceBF16Op1):
+    def init_config(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+        self.case = np.random.randn(2, 20, 2, 3).astype(self.np_dtype)
+        self.inputs = {'Input': self.case}
+        self.attrs = {'offset': -5, 'axis1': 1, 'axis2': -1}
+        self.target = np.trace(
+            self.inputs['Input'],
+            offset=self.attrs['offset'],
+            axis1=self.attrs['axis1'],
+            axis2=self.attrs['axis2'],
+        )
+
+
 class TestTraceAPICase(unittest.TestCase):
     def test_case1(self):
         case = np.random.randn(2, 20, 2, 3).astype('float32')
diff --git a/python/paddle/fluid/tests/unittests/test_unfold_op.py b/python/paddle/fluid/tests/unittests/test_unfold_op.py
index 1e7a91ef960d1..2ed5bf3f82c85 100644
--- a/python/paddle/fluid/tests/unittests/test_unfold_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unfold_op.py
@@ -15,7 +15,7 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

 import paddle
 from paddle import fluid
@@ -42,7 +42,11 @@ def init_data(self):
             self.input_height,
             self.input_width,
         ]
-        self.x = np.random.rand(*input_shape).astype(np.float64)
+        if self.dtype == np.uint16:
+            as_type = self.np_dtype
+        else:
+            as_type = self.dtype
+        self.x = np.random.rand(*input_shape).astype(as_type)

     def calc_unfold(self):
         output_shape = [0] * 3
@@ -77,7 +81,11 @@ def calc_unfold(self):
             + 1
         )
         output_shape[2] = out_height * out_width
-        output = np.zeros(output_shape).astype(np.float64)
+        if self.dtype == np.uint16:
+            as_type = self.np_dtype
+        else:
+            as_type = self.dtype
+        output = np.zeros(output_shape).astype(as_type)
         # ------------ calculate output -------------- #
         for i in range(output_shape[0]):
             for j in range(output_shape[1]):
@@ -123,9 +131,13 @@ def set_data(self):

     def setUp(self):
         self.op_type = 'unfold'
+        self.init_dtype()
         self.python_api = paddle.nn.functional.unfold
         self.set_data()

+    def init_dtype(self):
+        self.dtype = np.float64
+
     def test_check_output(self):
         self.check_output()

@@ -133,6 +145,55 @@ def test_check_grad(self):
         self.check_grad(['X'], 'Y')


+class TestUnfoldFP16Op(TestUnfoldOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUnfoldBF16Op(TestUnfoldOp):
+    # Note: this test is time-consuming and may time out, so the parameters are reduced to shorten the run time
+    def init_data(self):
+        self.batch_size = 3
+        self.input_channels = 3
+        self.input_height = 5
+        self.input_width = 5
+        self.kernel_sizes = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1, 1, 1]
+        self.dilations = [1, 1]
+        input_shape = [
+            self.batch_size,
+            self.input_channels,
+            self.input_height,
+            self.input_width,
+        ]
+        self.x = np.random.rand(*input_shape).astype(self.np_dtype)
+
+    def init_dtype(self):
+        self.dtype = np.uint16
+        self.np_dtype = np.float32
+
+    def setUp(self):
+        self.op_type = 'unfold'
+        self.init_dtype()
+        self.python_api = paddle.nn.functional.unfold
+        self.set_data()
+        self.inputs['X'] = convert_float_to_uint16(self.inputs['X'])
+        self.outputs['Y'] = convert_float_to_uint16(self.outputs['Y'])
+        self.place = core.CUDAPlace(0)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place)
+
+    def test_check_grad(self):
+        self.check_grad_with_place(self.place, ['X'], 'Y')
+
+
 class TestUnfoldAPI(TestUnfoldOp):
     """
     This is for test on paddle.nn.Unfold
diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
index a704ae9a4ee21..c4a243f47897b 100644
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_inplace_op.py
@@ -15,9 +15,19 @@
 import unittest

 import numpy as np
+from eager_op_test import OpTest, convert_uint16_to_float

 import paddle
 from paddle import fluid
+from paddle.fluid import core
+
+
+def output_hist(out):
+    hist, _ = np.histogram(out, range=(-1, 1))
+    hist = hist.astype("float32")
+    hist /= float(out.size)
+    prob = 0.1 * np.ones(10)
+    return hist, prob


 class TestUniformRandomInplaceOpDtype(unittest.TestCase):
@@ -44,6 +54,72 @@ def test_fp64():
         test_fp64()


+class TestUniformRandomInplaceFP16Op(OpTest):
+    def setUp(self):
+        self.op_type = "uniform_random_inplace"
+        self.dtype = np.float16
+        self.shape = (1000, 784)
+        x = np.random.random(self.shape).astype(self.dtype)
+        y = np.random.random(self.shape).astype(self.dtype)
+        self.inputs = {"X": x}
+        self.outputs = {"Out": y}
+        self.init_attrs()
+
+    def init_attrs(self):
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_customized(self.verify_output)
+
+    def verify_output(self, outs):
+        hist, prob = self.output_hist(np.array(outs[0]))
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.001)
+
+    # TODO: check_dygraph is False because self.python_api = paddle.uniform_random_inplace is not set; enable the dygraph check once it is
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', check_dygraph=False)
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUniformRandomInplaceBF16Op(OpTest):
+    def setUp(self):
+        self.op_type = "uniform_random_inplace"
+        self.dtype = np.uint16
+        self.shape = (1000, 784)
+        x = np.random.random(self.shape).astype(self.dtype)
+        y = np.random.random(self.shape).astype(self.dtype)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': y}
+        self.init_attrs()
+        self.place = core.CUDAPlace(0)
+
+    def init_attrs(self):
+        self.output_hist = output_hist
+
+    def test_check_output(self):
+        self.check_output_with_place_customized(self.verify_output, self.place)
+
+    def verify_output(self, outs):
+        result = convert_uint16_to_float(np.array(outs[0]))
+        hist, prob = self.output_hist(result)
+        np.testing.assert_allclose(hist, prob, rtol=0, atol=0.002)
+
+    # TODO: check_dygraph is False because self.python_api = paddle.uniform_random_inplace is not set; enable the dygraph check once it is
+    def test_check_grad(self):
+        grads = [paddle.zeros(self.shape, dtype=self.dtype)]
+        self.check_grad_with_place(
+            self.place,
+            ['X'],
+            'Out',
+            check_dygraph=False,
+            user_defined_grads=grads,
+        )
+
+
 class TestUniformRandomInplaceOpIsInplace(unittest.TestCase):
     def setUp(self):
         self.shape = (1000, 784)
diff --git a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
index 5ed9d0c85915d..6060236c5c5c9 100644
--- a/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
+++ b/python/paddle/fluid/tests/unittests/test_update_loss_scaling_op.py
@@ -15,10 +15,11 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest, paddle_static_guard
+from eager_op_test import OpTest, convert_float_to_uint16, paddle_static_guard

 import paddle
 from paddle import fluid
+from paddle.fluid import core
 from paddle.static.amp import amp_nn


@@ -81,8 +82,10 @@ def setUp(self):
     def init(self):
         self.incr_ratio = 2.0
         self.decr_ratio = 0.8
-        self.dtype = np.float32
-        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
+        self.init_dtype()
+        self.prev_loss_scaling = np.array([2048]).astype(
+            self.loss_scaling_dtype
+        )
         self.num_good_steps = np.array([999], dtype=np.int32)
         self.num_bad_steps = np.array([1], dtype=np.int32)
         self.zero_steps = np.array([0], dtype=np.int32)
@@ -94,10 +97,77 @@ def init(self):
             'decr_ratio': self.decr_ratio,
         }

+    def init_dtype(self):
+        self.dtype = np.float32
+        self.loss_scaling_dtype = np.float32
+
     def test_check_output(self):
         self.check_output(no_check_set=['Out'])


+class TestUpdateLossScalingFP16Op(TestUpdateLossScalingOp):
+    def init_dtype(self):
+        self.dtype = np.float16
+        self.loss_scaling_dtype = np.float32
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA or not support bfloat16",
+)
+class TestUpdateLossScalingBF16Op(OpTest):
+    def init(self):
+        self.incr_ratio = 2.0
+        self.decr_ratio = 0.8
+        self.dtype = np.uint16
+        self.np_dtype = "float32"
+        self.prev_loss_scaling = np.array([2048]).astype(self.np_dtype)
+        self.num_good_steps = np.array([999], dtype=np.int32)
+        self.num_bad_steps = np.array([1], dtype=np.int32)
+        self.zero_steps = np.array([0], dtype=np.int32)
+        self.stop_update = np.array([False], dtype=np.bool_)
+        self.attrs = {
+            'incr_every_n_steps': 1000,
+            'decr_every_n_nan_or_inf': 2,
+            'incr_ratio': self.incr_ratio,
+            'decr_ratio': self.decr_ratio,
+        }
+
+    def setUp(self):
+        self.op_type = "update_loss_scaling"
+        self.init()
+        self.python_api = update_loss_scaling_wrapper
+        self.python_out_sig = [
+            "out0",
+            "LossScaling",
+            "OutGoodSteps",
+            "OutBadSteps",
+        ]
+        found_inf = np.array([False], dtype=np.bool_)
+        x = np.random.random((1024, 1024)).astype(self.np_dtype)
+
+        self.inputs = {
+            'X': [('x0', convert_float_to_uint16(x))],
+            'FoundInfinite': found_inf,
+            # do not convert
+            'PrevLossScaling': self.prev_loss_scaling,
+            'InGoodSteps': self.num_good_steps,
+            'InBadSteps': self.num_bad_steps,
+        }
+
+        self.outputs = {
+            'Out': [('out0', convert_float_to_uint16(x))],
+            # do not convert
+            'LossScaling': self.prev_loss_scaling * self.incr_ratio,
+            'OutGoodSteps': self.zero_steps,
+            'OutBadSteps': self.zero_steps,
+        }
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0), no_check_set=['Out'])
+
+
 class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
     def setUp(self):
         self.op_type = "update_loss_scaling"
diff --git a/python/paddle/nn/clip.py b/python/paddle/nn/clip.py
index a6e70a5d08ec2..1952ea0514b50 100644
--- a/python/paddle/nn/clip.py
+++ b/python/paddle/nn/clip.py
@@ -63,7 +63,9 @@ def clip_by_norm(x, max_norm, name=None):
         return _legacy_C_ops.clip_by_norm(x, 'max_norm', max_norm)

     helper = LayerHelper("clip_by_norm", **locals())
-    check_variable_and_dtype(x, 'X', ['float32', 'float16'], 'clip_by_norm')
+    check_variable_and_dtype(
+        x, 'X', ['float16', 'float32', 'uint16'], 'clip_by_norm'
+    )
     check_type(max_norm, 'max_norm', (float), 'clip_by_norm')

     if name is None:
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index 8fd603f4379f0..d89ce9cb4537a 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -538,10 +538,13 @@ def prelu(x, weight, data_format="NCHW", name=None):
         return _C_ops.prelu(x, weight, data_format, mode)
     else:
         check_variable_and_dtype(
-            x, 'x', ['float16', 'float32', 'float64'], 'prelu'
+            x, 'x', ['float16', 'float32', 'float64', 'uint16'], 'prelu'
         )
         check_variable_and_dtype(
-            weight, 'weight', ['float16', 'float32', 'float64'], 'prelu'
+            weight,
+            'weight',
+            ['float16', 'float32', 'float64', 'uint16'],
+            'prelu',
         )
         helper = LayerHelper('prelu', **locals())
         out = helper.create_variable_for_type_inference(x.dtype)
diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py
index dacbeb7f30857..e36af4a8eae53 100644
--- a/python/paddle/tensor/linalg.py
+++ b/python/paddle/tensor/linalg.py
@@ -2498,7 +2498,7 @@ def multi_dot(x, name=None):
         check_variable_and_dtype(
             item,
             'x[' + str(id) + ']',
-            ['float16', 'float32', 'float64'],
+            ['float16', 'float32', 'float64', 'uint16'],
             'multi_dot',
         )
         if item.dtype != x[0].dtype:
diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py
index fee880f5bf695..c2b4ec7126388 100644
--- a/python/paddle/tensor/manipulation.py
+++ b/python/paddle/tensor/manipulation.py
@@ -1240,7 +1240,15 @@ def broadcast_tensors(input, name=None):
             check_variable_and_dtype(
                 x,
                 'input[' + str(id) + ']',
-                ['bool', 'float32', 'float64', 'int32', 'int64'],
+                [
+                    'bool',
+                    'float16',
+                    'float32',
+                    'float64',
+                    'int32',
+                    'int64',
+                    'uint16',
+                ],
                 'broadcast_tensors',
             )
             if x.dtype != input[0].dtype:
@@ -1977,6 +1985,7 @@ def split(x, num_or_sections, axis=0, name=None):
             'int32',
             'int64',
             'uint8',
+            'uint16',
             'int8',
         ],
         'split',
@@ -4559,7 +4568,15 @@ def take_along_axis(arr, indices, axis):
     check_variable_and_dtype(
         arr,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
         'take_along_axis',
     )
     check_variable_and_dtype(
@@ -4631,7 +4648,15 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'):
     check_variable_and_dtype(
         arr,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'],
+        [
+            'float16',
+            'float32',
+            'float64',
+            'int32',
+            'int64',
+            'uint8',
+            'uint16',
+        ],
         'put_along_axis',
     )
     check_variable_and_dtype(
@@ -4713,7 +4738,7 @@ def index_add(x, index, axis, value, name=None):
     check_variable_and_dtype(
         x,
         'x',
-        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'],
         'paddle.tensor.manipulation.index_add',
     )
     check_variable_and_dtype(
@@ -4725,7 +4750,7 @@ def index_add(x, index, axis, value, name=None):
     check_variable_and_dtype(
         value,
         'add_value',
-        ['float16', 'float32', 'float64', 'int32', 'int64'],
+        ['float16', 'float32', 'float64', 'int32', 'int64', 'uint16'],
         'paddle.tensor.manipulation.index_add',
     )

diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index efa8cbfc54e02..bd117715bed49 100644
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -1958,10 +1958,14 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None):

     helper = LayerHelper("addmm", **locals())
     check_variable_and_dtype(
-        input, 'Input', ['float32', 'float64'], 'addmm'
+        input, 'Input', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
+    )
+    check_variable_and_dtype(
+        x, 'X', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
+    )
+    check_variable_and_dtype(
+        y, 'Y', ['float16', 'float32', 'float64', 'uint16'], 'addmm'
     )
-    check_variable_and_dtype(x, 'X', ['float32', 'float64'], 'addmm')
-    check_variable_and_dtype(y, 'Y', ['float32', 'float64'], 'addmm')
     out = helper.create_variable_for_type_inference(dtype=x.dtype)

     helper.append_op(
@@ -2456,7 +2460,10 @@ def min(x, axis=None, keepdim=False, name=None):

     reduce_all, axis = _get_reduce_axis_with_tensor(axis, x)
     helper = LayerHelper('min', **locals())
     check_variable_and_dtype(
-        x, 'x', ['float32', 'float64', 'int32', 'int64'], 'min'
+        x,
+        'x',
+        ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'],
+        'min',
     )
     out = helper.create_variable_for_type_inference(dtype=x.dtype)