From 538765867edb49cf84511ed19c0301650f8d2af1 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Fri, 11 Feb 2022 19:45:22 +0800 Subject: [PATCH 01/10] pten matmul cuda kernel support bf16 --- paddle/fluid/operators/math/blas_impl.cu.h | 239 ++++++++++++++++++ paddle/pten/kernels/gpu/matmul_kernel.cu | 1 + .../unittests/npu/test_matmulv2_op_npu.py | 68 ++--- .../tests/unittests/test_matmul_v2_op.py | 163 +++++++++--- .../unittests/xpu/test_matmul_v2_op_xpu.py | 36 +-- 5 files changed, 414 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 7ffd2a7ab2d84..12ae5190bc37a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -813,6 +813,103 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM( @@ -1208,6 +1305,42 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -1306,6 +1439,86 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM( @@ -1356,6 +1569,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index f9fdbd27bf94e..b365581e949c1 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -27,5 +27,6 @@ PT_REGISTER_KERNEL(matmul, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 882043ef6eb91..23ca0cf1f492f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -71,7 +71,7 @@ def test_check_grad(self): self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -83,7 +83,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -95,7 +95,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -107,7 +107,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -119,7 +119,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -131,7 +131,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -143,7 +143,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -155,7 +155,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -167,7 +167,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -179,7 +179,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -191,7 +191,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -203,7 +203,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -215,7 +215,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -227,7 +227,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -239,7 +239,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -251,7 +251,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -263,7 +263,7 @@ def config(self): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -275,7 +275,7 @@ def config(self): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -310,22 +310,22 @@ def test_check_grad(self): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) 
-create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index efcc0e4cfe323..83e60dcb322af 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -16,7 +16,8 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op import paddle.fluid.core as core import paddle @@ -73,17 +74,32 @@ def setUp(self): self.init_kernel_type() self.config() self.op_type = "matmul_v2" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) + if self.is_bfloat16_op(): + x = np.random.random(self.x_shape).astype(np.float32) + y = np.random.random(self.y_shape).astype(np.float32) + else: + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 x = -0.1 + 0.2 * x y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) result = result.astype(self.dtype) - self.inputs = { - 'X': x, - 'Y': y, - } + if self.is_bfloat16_op(): + output = output.astype(np.float32) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.inputs_fp32 = { + 'X': x, + 'Y': y, + } + else: + self.inputs = { + 'X': x, + 'Y': y, + } self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} @@ -97,7 +113,7 @@ def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +125,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +137,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +149,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +161,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +173,7 @@ def config(self): self.trans_y = False -class 
TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +185,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +197,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +209,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +221,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +233,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +245,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +257,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +269,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +281,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -277,7 +293,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +305,7 @@ def config(self): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -301,7 +317,7 @@ def config(self): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -343,22 +359,87 @@ def test_check_grad(self): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) + +#--------------------test matmul bf16-------------------- + + +def create_test_bf16_class(parent, atol=0.01): + @unittest.skipIf( + not 
core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestMatMulOpBf16Case(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Output']) + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad_x(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + self.check_grad_with_place( + place, ['X'], + 'Out', + no_grad_set=set(['Y']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_y(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Y') + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set(['X']), + user_defined_grads=[numeric_grads]) + + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") + TestMatMulOpBf16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpBf16Case + + +create_test_bf16_class(TestMatMulV2Op) +create_test_bf16_class(TestMatMulOp2) +create_test_bf16_class(TestMatMulOp3) +create_test_bf16_class(TestMatMulOp4) +create_test_bf16_class(TestMatMulOp5) +create_test_bf16_class(TestMatMulOp6) +create_test_bf16_class(TestMatMulOp7) +create_test_bf16_class(TestMatMulOp8) +create_test_bf16_class(TestMatMulOp9) +create_test_bf16_class(TestMatMulOp10) +create_test_bf16_class(TestMatMulOp11) +create_test_bf16_class(TestMatMulOp12) +create_test_bf16_class(TestMatMulOp13) +create_test_bf16_class(TestMatMulOp14) +create_test_bf16_class(TestMatMulOp15) +create_test_bf16_class(TestMatMulOp16) +create_test_bf16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 435026220c2b5..45d60c8538e09 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -97,7 +97,7 @@ def test_check_grad(self): self.check_grad_with_place(place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +109,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +121,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +133,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +145,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +157,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +169,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +181,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +193,7 @@ def 
config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +205,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +217,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +229,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +241,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +253,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +265,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the big data """ @@ -277,7 +277,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +289,7 @@ def config(self): self.trans_y = False -# class TestMatMuklOpBroadcast1(TestMatMulV2Op): +# class TestMatMulOpBroadcast1(TestMatMulV2Op): # """ # case 14_3 # """ @@ -300,7 +300,7 @@ def config(self): # self.trans_x = True # self.trans_y = True -# class TestMatMuklOpBroadcast2(TestMatMulV2Op): +# class TestMatMulOpBroadcast2(TestMatMulV2Op): # """ # case 14_4 # """ From f6254c97f97f9960d136a93299cb0b6871c37187 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 12:12:45 +0800 Subject: [PATCH 02/10] fix pten kernel name --- paddle/fluid/framework/pten_utils.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 265bd99593dc4..2048d5fd353d8 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -185,8 +185,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::once_flag kernel_sig_map_init_flag; From cd420d0404e82d1dd1af78db06f8f3c486df4a80 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 12:13:26 +0800 Subject: [PATCH 03/10] add matmul_grad bf16 kernel --- .../platform/device/gpu/cuda/cuda_device_function.h | 11 +++++++++++ paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 1 + 2 files changed, 12 insertions(+) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index cd78a89088cc6..caa40d555fa20 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -61,6 +62,16 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +#if defined(PADDLE_CUDA_BF16) +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +} +#endif + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 31c44673f94e7..7df99260aa161 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -26,6 +26,7 @@ PT_REGISTER_KERNEL(matmul_grad, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} From 7a3bcc87c39d7126eb1a96ae9a5eea58d3c788c5 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 13:24:37 +0800 Subject: [PATCH 04/10] add emptylike bf16 kernel --- paddle/pten/kernels/empty_kernel.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index ecb058d35b909..e1a1788815ebf 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -94,6 +94,7 @@ PT_REGISTER_KERNEL(empty_like, int64_t, bool, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} #endif From f114ea0228530a7fc66d8838360c2a3580fe029f Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 15:34:38 +0800 Subject: [PATCH 05/10] fix compile --- .../fluid/platform/device/gpu/cuda/cuda_device_function.h | 7 +++++-- .../fluid/platform/device/gpu/rocm/rocm_device_function.h | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index caa40d555fa20..5aff7fbb221c5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -62,15 +62,18 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } -#if defined(PADDLE_CUDA_BF16) template <> __forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { +#if defined(PADDLE_CUDA_BF16) return bfloat16(__shfl_down_sync(mask, static_cast(val), static_cast(delta), width)); -} +#else + PADDLE_THROW(platform::errors::Unimplemented( + "__shfl_down_sync with bfloat16 is not supported on cuda <= 11")); #endif +} template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 13ffc2396946c..24725d9741cc3 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -59,6 +59,14 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 
CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { From ef557ea9abfc19f35f542d49c7145f15f4e8c74b Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 17:05:25 +0800 Subject: [PATCH 06/10] suppport rocm --- paddle/fluid/operators/math/blas_impl.cu.h | 11 +- paddle/fluid/operators/math/blas_impl.hip.h | 195 ++++++++++++++++++++ 2 files changed, 200 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 12ae5190bc37a..8912289a87870 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -820,6 +820,7 @@ inline void Blas::GEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -840,7 +841,6 @@ inline void Blas::GEMM( float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -869,6 +869,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -878,18 +879,16 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE( context_.GetComputeCapability(), 80, platform::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 80," + "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", context_.GetComputeCapability())); float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1446,6 +1445,7 @@ inline void Blas::BatchedGEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1457,7 +1457,6 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1486,6 +1485,7 @@ inline void Blas::BatchedGEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1497,7 +1497,6 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index bf7d66f485327..86293e21f164b 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -550,6 +550,80 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, + ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, + rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + }); +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? 
+ PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, + ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, + rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + }); +} + template <> template <> inline void Blas::GEMM( @@ -874,6 +948,39 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -898,6 +1005,7 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -925,6 +1033,66 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, T *C, int batchCount, + int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, transb, transa, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, + rocblas_datatype_f32_r, algo)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, T *C, int batchCount, + int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, transb, transa, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, + rocblas_datatype_f32_r, algo)); + }); +} + template <> template void Blas::BatchedGEMM( @@ -935,6 +1103,7 @@ void Blas::BatchedGEMM( C[k]); } } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -973,6 +1142,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, From 8402a9024a7cf8d91f5997db96d3033cdaf26d02 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 20:58:12 +0800 Subject: [PATCH 07/10] fix error --- paddle/fluid/operators/math/blas_impl.cu.h | 13 +++++++++---- python/paddle/fluid/tests/unittests/op_test.py | 4 ++-- .../fluid/tests/unittests/test_matmul_v2_op.py | 15 +++++++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 8912289a87870..2b280b9b9d6e1 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -1456,6 +1456,8 @@ inline void Blas::BatchedGEMM( cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); @@ -1466,8 +1468,8 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, - strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); }); #else @@ -1497,6 +1499,9 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1506,8 +1511,8 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, - strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); }); #else diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 754d7bd54b9f8..85423df3d3828 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1658,7 +1658,7 @@ def check_grad_with_place(self, for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_analytic_grads.append(grad) analytic_grads = fp32_analytic_grads @@ -1666,7 +1666,7 @@ def check_grad_with_place(self, for grad in numeric_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_numeric_grads.append(grad) numeric_grads = fp32_numeric_grads diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 83e60dcb322af..ed1495c6352bb 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -80,13 +80,12 @@ def setUp(self): else: x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - result = result.astype(self.dtype) if self.is_bfloat16_op(): - output = output.astype(np.float32) + result = result.astype(np.float32) self.inputs = { 'X': convert_float_to_uint16(x), 'Y': convert_float_to_uint16(y), @@ -96,6 +95,7 @@ def setUp(self): 'Y': y, } else: + result = result.astype(self.dtype) self.inputs = { 'X': x, 'Y': y, @@ -391,7 +391,7 @@ def get_numeric_grad(self, place, check_name): op = create_op(scope, self.op_type, self.inputs, self.outputs, self.attrs) return get_numeric_gradient(place, scope, op, self.inputs_fp32, - check_name, ['Output']) + check_name, ['Out']) def init_kernel_type(self): self.dtype = np.uint16 @@ -418,6 +418,9 @@ def test_check_grad_y(self): no_grad_set=set(['X']), user_defined_grads=[numeric_grads]) + def test_check_grad(self): + pass + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") TestMatMulOpBf16Case.__name__ = cls_name globals()[cls_name] = TestMatMulOpBf16Case From cff9fdb644c77304aba6d58f2bfb8e0826bb9f16 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 11:42:36 +0800 Subject: [PATCH 08/10] fix rocm --- 
paddle/fluid/operators/math/blas_impl.hip.h | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 86293e21f164b..5f262990e8208 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -577,12 +577,14 @@ inline void Blas::GEMM( float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( - handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, - ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, - rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -615,12 +617,14 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( - handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, - ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, - rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -1037,8 +1041,9 @@ template <> template <> inline void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - T alpha, const T *A, const T *B, T beta, T *C, int batchCount, - int64_t strideA, int64_t strideB) const { + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -1056,10 +1061,11 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_gemm_strided_batched_ex( - handle, transb, transa, N, M, K, &h_alpha, B, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, - lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, - rocblas_datatype_f32_r, algo)); + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -1067,8 +1073,9 @@ template <> template <> inline void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - T alpha, const T *A, const T *B, T beta, T *C, int batchCount, - int64_t strideA, int64_t strideB) const { + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { int lda = (transA == CblasNoTrans) ? 
K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -1086,10 +1093,11 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_gemm_strided_batched_ex( - handle, transb, transa, N, M, K, &h_alpha, B, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, - lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, - rocblas_datatype_f32_r, algo)); + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); }); } From 8c29718fea13a9966c0f714f83697353e66fd2df Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 15:35:11 +0800 Subject: [PATCH 09/10] add bf16 header file --- paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 24725d9741cc3..63897bd671740 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" From ef6a4d38f61d7dffe17c0abd9bbcb2786bbf8405 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 16:58:47 +0800 Subject: [PATCH 10/10] fix compile --- paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 5aff7fbb221c5..58a25ae8d0e56 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace platform { @@ -70,8 +71,8 @@ __forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, return bfloat16(__shfl_down_sync(mask, static_cast(val), static_cast(delta), width)); #else - PADDLE_THROW(platform::errors::Unimplemented( - "__shfl_down_sync with bfloat16 is not supported on cuda <= 11")); + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); #endif }
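The bf16 unit tests added in this series store their inputs as np.uint16 arrays via convert_float_to_uint16 and keep a float32 copy (inputs_fp32) for the numeric gradient check. The sketch below shows the bit-level relationship between float32 and that uint16 bfloat16 storage; it is not part of the patch, the helper names are illustrative, and the real Paddle converter may round to nearest rather than truncate as assumed here. The precision lost in the low 16 mantissa bits is also why the bf16 cases use atol=0.01 and why op_test relaxes max_relative_error to 0.04.

# Illustrative sketch, not part of the patch: bfloat16 is the upper half of the
# IEEE-754 float32 bit pattern, which is why the tests can hold bf16 data in
# np.uint16 arrays. Truncation is assumed here; a production converter (such as
# Paddle's convert_float_to_uint16) may round to nearest even instead.
import numpy as np

def float32_to_bf16_bits(x):
    # Drop the low 16 mantissa bits, keeping sign, exponent, and high mantissa.
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float32(bits):
    # Re-expand to float32 with the dropped mantissa bits zeroed.
    bits = np.asarray(bits, dtype=np.uint16)
    return (bits.astype(np.uint32) << 16).view(np.float32)

x = np.array([0.1, 1.5, -3.25], dtype=np.float32)
print(bf16_bits_to_float32(float32_to_bf16_bits(x)))
# -> approximately [0.09960938, 1.5, -3.25]; the ~0.4% error on 0.1 is the kind
#    of gap the loosened atol / max_relative_error in the tests accommodate.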