From 538765867edb49cf84511ed19c0301650f8d2af1 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Fri, 11 Feb 2022 19:45:22 +0800 Subject: [PATCH 01/10] pten matmul cuda kernel support bf16 --- paddle/fluid/operators/math/blas_impl.cu.h | 239 ++++++++++++++++++ paddle/pten/kernels/gpu/matmul_kernel.cu | 1 + .../unittests/npu/test_matmulv2_op_npu.py | 68 ++--- .../tests/unittests/test_matmul_v2_op.py | 163 +++++++++--- .../unittests/xpu/test_matmul_v2_op_xpu.py | 36 +-- 5 files changed, 414 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 7ffd2a7ab2d84..12ae5190bc37a 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -813,6 +813,103 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 8000 } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + // TODO(kexinzhao): add processing code for compute capability < 53 case + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "cublas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmEx( + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, A, + CUDA_R_16BF, lda, &h_beta, C, CUDA_R_16BF, N, CUDA_R_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmEx with bfloat16 is not supported on cuda <= 11")); + +#endif // CUDA_VERSION >= 11000 +} + template <> template <> inline void Blas::GEMM( @@ -1208,6 +1305,42 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because cublas doesn't support bfloat gemv, we use cublasHgemm to achieve + // it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -1306,6 +1439,86 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif // CUDA_VERSION >= 9010 } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? 
"True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + const int64_t strideC = M * N; + +#if CUDA_VERSION >= 11000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = context_.tensor_core_available(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " << (use_tensor_op_math ? "True" : "False"); + + context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( + handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); + }); +#else + // raise error + PADDLE_THROW(platform::errors::Unimplemented( + "cublasGemmStridedBatchedEx with bfloat16 is not supported on cuda <= " + "11")); +#endif // CUDA_VERSION >= 11000 +} + template <> template void Blas::BatchedGEMM( @@ -1356,6 +1569,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, diff --git a/paddle/pten/kernels/gpu/matmul_kernel.cu b/paddle/pten/kernels/gpu/matmul_kernel.cu index f9fdbd27bf94e..b365581e949c1 100644 --- a/paddle/pten/kernels/gpu/matmul_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_kernel.cu @@ -27,5 +27,6 @@ PT_REGISTER_KERNEL(matmul, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 882043ef6eb91..23ca0cf1f492f 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -71,7 +71,7 @@ def test_check_grad(self): self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -83,7 +83,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -95,7 +95,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -107,7 +107,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -119,7 +119,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -131,7 +131,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -143,7 +143,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -155,7 +155,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -167,7 +167,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -179,7 +179,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -191,7 +191,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -203,7 +203,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -215,7 +215,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -227,7 +227,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -239,7 +239,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -251,7 +251,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -263,7 +263,7 @@ def config(self): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -275,7 +275,7 @@ def config(self): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -310,22 +310,22 @@ def test_check_grad(self): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) 
-create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index efcc0e4cfe323..83e60dcb322af 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -16,7 +16,8 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.testsuite import create_op import paddle.fluid.core as core import paddle @@ -73,17 +74,32 @@ def setUp(self): self.init_kernel_type() self.config() self.op_type = "matmul_v2" - x = np.random.random(self.x_shape).astype(self.dtype) - y = np.random.random(self.y_shape).astype(self.dtype) + if self.is_bfloat16_op(): + x = np.random.random(self.x_shape).astype(np.float32) + y = np.random.random(self.y_shape).astype(np.float32) + else: + x = np.random.random(self.x_shape).astype(self.dtype) + y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 x = -0.1 + 0.2 * x y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) result = result.astype(self.dtype) - self.inputs = { - 'X': x, - 'Y': y, - } + if self.is_bfloat16_op(): + output = output.astype(np.float32) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.inputs_fp32 = { + 'X': x, + 'Y': y, + } + else: + self.inputs = { + 'X': x, + 'Y': y, + } self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} @@ -97,7 +113,7 @@ def test_check_grad(self): self.check_grad(['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +125,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +137,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +149,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +161,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +173,7 @@ def config(self): self.trans_y = False -class 
TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +185,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +197,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +209,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +221,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +233,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +245,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +257,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +269,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +281,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the gradient for special case """ @@ -277,7 +293,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +305,7 @@ def config(self): self.trans_y = False -class TestMatMuklOpBroadcast1(TestMatMulV2Op): +class TestMatMulOpBroadcast1(TestMatMulV2Op): """ case 14_3 """ @@ -301,7 +317,7 @@ def config(self): self.trans_y = True -class TestMatMuklOpBroadcast2(TestMatMulV2Op): +class TestMatMulOpBroadcast2(TestMatMulV2Op): """ case 14_4 """ @@ -343,22 +359,87 @@ def test_check_grad(self): create_test_fp16_class(TestMatMulV2Op) -create_test_fp16_class(TestMatMuklOp2) -create_test_fp16_class(TestMatMuklOp3) -create_test_fp16_class(TestMatMuklOp4) -create_test_fp16_class(TestMatMuklOp5) -create_test_fp16_class(TestMatMuklOp6) -create_test_fp16_class(TestMatMuklOp7) -create_test_fp16_class(TestMatMuklOp8) -create_test_fp16_class(TestMatMuklOp9) -create_test_fp16_class(TestMatMuklOp10) -create_test_fp16_class(TestMatMuklOp11) -create_test_fp16_class(TestMatMuklOp12) -create_test_fp16_class(TestMatMuklOp13) -create_test_fp16_class(TestMatMuklOp14) -create_test_fp16_class(TestMatMuklOp15) -create_test_fp16_class(TestMatMuklOp16) -create_test_fp16_class(TestMatMuklOp17) +create_test_fp16_class(TestMatMulOp2) +create_test_fp16_class(TestMatMulOp3) +create_test_fp16_class(TestMatMulOp4) +create_test_fp16_class(TestMatMulOp5) +create_test_fp16_class(TestMatMulOp6) +create_test_fp16_class(TestMatMulOp7) +create_test_fp16_class(TestMatMulOp8) +create_test_fp16_class(TestMatMulOp9) +create_test_fp16_class(TestMatMulOp10) +create_test_fp16_class(TestMatMulOp11) +create_test_fp16_class(TestMatMulOp12) +create_test_fp16_class(TestMatMulOp13) +create_test_fp16_class(TestMatMulOp14) +create_test_fp16_class(TestMatMulOp15) +create_test_fp16_class(TestMatMulOp16) +create_test_fp16_class(TestMatMulOp17) + +#--------------------test matmul bf16-------------------- + + +def create_test_bf16_class(parent, atol=0.01): + @unittest.skipIf( + not 
core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 8.1.0" + ) + class TestMatMulOpBf16Case(parent): + def get_numeric_grad(self, place, check_name): + scope = core.Scope() + self._check_grad_helper() + op = create_op(scope, self.op_type, self.inputs, self.outputs, + self.attrs) + return get_numeric_gradient(place, scope, op, self.inputs_fp32, + check_name, ['Output']) + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=atol) + + def test_check_grad_x(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'X') + self.check_grad_with_place( + place, ['X'], + 'Out', + no_grad_set=set(['Y']), + user_defined_grads=[numeric_grads]) + + def test_check_grad_y(self): + place = core.CUDAPlace(0) + numeric_grads = self.get_numeric_grad(place, 'Y') + self.check_grad_with_place( + place, ['Y'], + 'Out', + no_grad_set=set(['X']), + user_defined_grads=[numeric_grads]) + + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") + TestMatMulOpBf16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpBf16Case + + +create_test_bf16_class(TestMatMulV2Op) +create_test_bf16_class(TestMatMulOp2) +create_test_bf16_class(TestMatMulOp3) +create_test_bf16_class(TestMatMulOp4) +create_test_bf16_class(TestMatMulOp5) +create_test_bf16_class(TestMatMulOp6) +create_test_bf16_class(TestMatMulOp7) +create_test_bf16_class(TestMatMulOp8) +create_test_bf16_class(TestMatMulOp9) +create_test_bf16_class(TestMatMulOp10) +create_test_bf16_class(TestMatMulOp11) +create_test_bf16_class(TestMatMulOp12) +create_test_bf16_class(TestMatMulOp13) +create_test_bf16_class(TestMatMulOp14) +create_test_bf16_class(TestMatMulOp15) +create_test_bf16_class(TestMatMulOp16) +create_test_bf16_class(TestMatMulOp17) class TestMatMulV2API(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py index 435026220c2b5..45d60c8538e09 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_matmul_v2_op_xpu.py @@ -97,7 +97,7 @@ def test_check_grad(self): self.check_grad_with_place(place, ['X', 'Y'], 'Out') -class TestMatMuklOp2(TestMatMulV2Op): +class TestMatMulOp2(TestMatMulV2Op): """ case 2 """ @@ -109,7 +109,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp3(TestMatMulV2Op): +class TestMatMulOp3(TestMatMulV2Op): """ case 3 """ @@ -121,7 +121,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp4(TestMatMulV2Op): +class TestMatMulOp4(TestMatMulV2Op): """ case 4 """ @@ -133,7 +133,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp5(TestMatMulV2Op): +class TestMatMulOp5(TestMatMulV2Op): """ case 5 """ @@ -145,7 +145,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp6(TestMatMulV2Op): +class TestMatMulOp6(TestMatMulV2Op): """ case 6 """ @@ -157,7 +157,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp7(TestMatMulV2Op): +class TestMatMulOp7(TestMatMulV2Op): """ case 7 """ @@ -169,7 +169,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp8(TestMatMulV2Op): +class TestMatMulOp8(TestMatMulV2Op): """ case 8 """ @@ -181,7 +181,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp9(TestMatMulV2Op): +class TestMatMulOp9(TestMatMulV2Op): """ case 9 """ @@ -193,7 +193,7 @@ def 
config(self): self.trans_y = True -class TestMatMuklOp10(TestMatMulV2Op): +class TestMatMulOp10(TestMatMulV2Op): """ case 10 """ @@ -205,7 +205,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp11(TestMatMulV2Op): +class TestMatMulOp11(TestMatMulV2Op): """ case 11 """ @@ -217,7 +217,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp12(TestMatMulV2Op): +class TestMatMulOp12(TestMatMulV2Op): """ case 12 """ @@ -229,7 +229,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp13(TestMatMulV2Op): +class TestMatMulOp13(TestMatMulV2Op): """ case 13 """ @@ -241,7 +241,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp14(TestMatMulV2Op): +class TestMatMulOp14(TestMatMulV2Op): """ case 14_1 """ @@ -253,7 +253,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp15(TestMatMulV2Op): +class TestMatMulOp15(TestMatMulV2Op): """ case 14_2 """ @@ -265,7 +265,7 @@ def config(self): self.trans_y = True -class TestMatMuklOp16(TestMatMulV2Op): +class TestMatMulOp16(TestMatMulV2Op): """ case 16 : to check the big data """ @@ -277,7 +277,7 @@ def config(self): self.trans_y = False -class TestMatMuklOp17(TestMatMulV2Op): +class TestMatMulOp17(TestMatMulV2Op): """ case 17 : to check the gradient for special case """ @@ -289,7 +289,7 @@ def config(self): self.trans_y = False -# class TestMatMuklOpBroadcast1(TestMatMulV2Op): +# class TestMatMulOpBroadcast1(TestMatMulV2Op): # """ # case 14_3 # """ @@ -300,7 +300,7 @@ def config(self): # self.trans_x = True # self.trans_y = True -# class TestMatMuklOpBroadcast2(TestMatMulV2Op): +# class TestMatMulOpBroadcast2(TestMatMulV2Op): # """ # case 14_4 # """ From f6254c97f97f9960d136a93299cb0b6871c37187 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 12:12:45 +0800 Subject: [PATCH 02/10] fix pten kernel name --- paddle/fluid/framework/pten_utils.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 265bd99593dc4..2048d5fd353d8 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -185,8 +185,9 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(op_proto_->type(), GetInputArgsNames(), - GetAttrsArgsNames(), GetOutputArgsNames()); + return KernelSignature(pten::TransToPtenKernelName(op_proto_->type()), + GetInputArgsNames(), GetAttrsArgsNames(), + GetOutputArgsNames()); } std::once_flag kernel_sig_map_init_flag; From cd420d0404e82d1dd1af78db06f8f3c486df4a80 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 12:13:26 +0800 Subject: [PATCH 03/10] add matmul_grad bf16 kernel --- .../platform/device/gpu/cuda/cuda_device_function.h | 11 +++++++++++ paddle/pten/kernels/gpu/matmul_grad_kernel.cu | 1 + 2 files changed, 12 insertions(+) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index cd78a89088cc6..caa40d555fa20 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. 
#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -61,6 +62,16 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +#if defined(PADDLE_CUDA_BF16) +template <> +__forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down_sync(mask, static_cast(val), + static_cast(delta), width)); +} +#endif + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { diff --git a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu index 31c44673f94e7..7df99260aa161 100644 --- a/paddle/pten/kernels/gpu/matmul_grad_kernel.cu +++ b/paddle/pten/kernels/gpu/matmul_grad_kernel.cu @@ -26,6 +26,7 @@ PT_REGISTER_KERNEL(matmul_grad, float, double, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} From 7a3bcc87c39d7126eb1a96ae9a5eea58d3c788c5 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 13:24:37 +0800 Subject: [PATCH 04/10] add emptylike bf16 kernel --- paddle/pten/kernels/empty_kernel.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/pten/kernels/empty_kernel.cc b/paddle/pten/kernels/empty_kernel.cc index ecb058d35b909..e1a1788815ebf 100644 --- a/paddle/pten/kernels/empty_kernel.cc +++ b/paddle/pten/kernels/empty_kernel.cc @@ -94,6 +94,7 @@ PT_REGISTER_KERNEL(empty_like, int64_t, bool, paddle::platform::float16, + paddle::platform::bfloat16, paddle::platform::complex, paddle::platform::complex) {} #endif From f114ea0228530a7fc66d8838360c2a3580fe029f Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 15:34:38 +0800 Subject: [PATCH 05/10] fix compile --- .../fluid/platform/device/gpu/cuda/cuda_device_function.h | 7 +++++-- .../fluid/platform/device/gpu/rocm/rocm_device_function.h | 8 ++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index caa40d555fa20..5aff7fbb221c5 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -62,15 +62,18 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } -#if defined(PADDLE_CUDA_BF16) template <> __forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, bfloat16 val, int delta, int width) { +#if defined(PADDLE_CUDA_BF16) return bfloat16(__shfl_down_sync(mask, static_cast(val), static_cast(delta), width)); -} +#else + PADDLE_THROW(platform::errors::Unimplemented( + "__shfl_down_sync with bfloat16 is not supported on cuda <= 11")); #endif +} template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 13ffc2396946c..24725d9741cc3 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -59,6 +59,14 @@ __forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, static_cast(delta), width)); } +template <> +__forceinline__ __device__ bfloat16 
CudaShuffleDownSync(unsigned mask, + bfloat16 val, int delta, + int width) { + return bfloat16(__shfl_down(static_cast(val), + static_cast(delta), width)); +} + template <> __forceinline__ __device__ paddle::platform::complex CudaShuffleDownSync( unsigned mask, paddle::platform::complex val, int delta, int width) { From ef557ea9abfc19f35f542d49c7145f15f4e8c74b Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 17:05:25 +0800 Subject: [PATCH 06/10] suppport rocm --- paddle/fluid/operators/math/blas_impl.cu.h | 11 +- paddle/fluid/operators/math/blas_impl.hip.h | 195 ++++++++++++++++++++ 2 files changed, 200 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 12ae5190bc37a..8912289a87870 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -820,6 +820,7 @@ inline void Blas::GEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -840,7 +841,6 @@ inline void Blas::GEMM( float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -869,6 +869,7 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -878,18 +879,16 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - // TODO(kexinzhao): add processing code for compute capability < 53 case PADDLE_ENFORCE_GE( context_.GetComputeCapability(), 80, platform::errors::InvalidArgument( - "cublas fp16 gemm requires GPU compute capability >= 80," + "cublas bf16 gemm requires GPU compute capability >= 80," "but received %d", context_.GetComputeCapability())); float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DEFAULT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1446,6 +1445,7 @@ inline void Blas::BatchedGEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1457,7 +1457,6 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1486,6 +1485,7 @@ inline void Blas::BatchedGEMM( platform::bfloat16 alpha, const platform::bfloat16 *A, const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, int batchCount, int64_t strideA, int64_t strideB) const { +#if CUDA_VERSION >= 11000 // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -1497,7 +1497,6 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; -#if CUDA_VERSION >= 11000 cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index bf7d66f485327..86293e21f164b 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -550,6 +550,80 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, rocblas_datatype_f16_r, N, rocblas_datatype_f32_r); } +template <> +template <> +inline void Blas::GEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? + PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, + ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, + rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + }); +} + +template <> +template <> +inline void Blas::GEMM(CBLAS_TRANSPOSE transA, + CBLAS_TRANSPOSE transB, int M, int N, + int K, platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + // TODO(zhiqiu): 80 has the same meaning for rocm and cuda? 
+ PADDLE_ENFORCE_GE( + context_.GetComputeCapability(), 80, + platform::errors::InvalidArgument( + "rocblas fp16 gemm requires GPU compute capability >= 80," + "but received %d", + context_.GetComputeCapability())); + + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( + handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, + ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, + rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + }); +} + template <> template <> inline void Blas::GEMM( @@ -874,6 +948,39 @@ inline void Blas::GEMV(bool trans_a, int M, int N, } } +template <> +template <> +inline void Blas::GEMV( + bool trans_a, int M, int N, platform::bfloat16 alpha, + const platform::bfloat16 *A, const platform::bfloat16 *B, + platform::bfloat16 beta, platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} +template <> +template <> +inline void Blas::GEMV(bool trans_a, int M, int N, + platform::bfloat16 alpha, + const platform::bfloat16 *A, + const platform::bfloat16 *B, + platform::bfloat16 beta, + platform::bfloat16 *C) const { + // Because rocblas doesn't support bfloat16 gemv, we use gemmex to achieve it. + if (trans_a) { + this->template GEMM(CblasNoTrans, CblasNoTrans, 1, N, M, + alpha, B, A, beta, C); + } else { + this->template GEMM(CblasNoTrans, CblasNoTrans, M, 1, N, + alpha, A, B, beta, C); + } +} + template <> template void Blas::BatchedGEMM( @@ -898,6 +1005,7 @@ void Blas::BatchedGEMM( ldc, strideC, batchCount); }); } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -925,6 +1033,66 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, }); } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, T *C, int batchCount, + int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, transb, transa, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, + rocblas_datatype_f32_r, algo)); + }); +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + T alpha, const T *A, const T *B, T beta, T *C, int batchCount, + int64_t strideA, int64_t strideB) const { + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? 
N : K; + int ldc = N; + const int64_t strideC = M * N; + rocblas_operation cuTransA = (transA == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + rocblas_operation cuTransB = (transB == CblasNoTrans) + ? rocblas_operation_none + : rocblas_operation_transpose; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; + + context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::rocblas_gemm_strided_batched_ex( + handle, transb, transa, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, + rocblas_datatype_f32_r, algo)); + }); +} + template <> template void Blas::BatchedGEMM( @@ -935,6 +1103,7 @@ void Blas::BatchedGEMM( C[k]); } } + template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -973,6 +1142,32 @@ inline void Blas::BatchedGEMM( } } +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + +template <> +template <> +inline void Blas::BatchedGEMM( + CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, + platform::bfloat16 alpha, const platform::bfloat16 **A, + const platform::bfloat16 **B, platform::bfloat16 beta, + platform::bfloat16 **C, int batchCount) const { + for (int k = 0; k < batchCount; ++k) { + this->template GEMM(transA, transB, M, N, K, alpha, + A[k], B[k], beta, C[k]); + } +} + template <> template void Blas::TRSM(CBLAS_SIDE side, CBLAS_UPLO uplo, From 8402a9024a7cf8d91f5997db96d3033cdaf26d02 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Mon, 14 Feb 2022 20:58:12 +0800 Subject: [PATCH 07/10] fix error --- paddle/fluid/operators/math/blas_impl.cu.h | 13 +++++++++---- python/paddle/fluid/tests/unittests/op_test.py | 4 ++-- .../fluid/tests/unittests/test_matmul_v2_op.py | 15 +++++++++------ 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 8912289a87870..2b280b9b9d6e1 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -1456,6 +1456,8 @@ inline void Blas::BatchedGEMM( cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); @@ -1466,8 +1468,8 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, - strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); }); #else @@ -1497,6 +1499,9 @@ inline void Blas::BatchedGEMM( (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; + float h_alpha = static_cast(alpha); + float h_beta = static_cast(beta); + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; bool use_tensor_op_math = context_.tensor_core_available(); if (use_tensor_op_math) { @@ -1506,8 +1511,8 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_16BF, ldb, - strideB, A, CUDA_R_16BF, lda, strideA, &beta, C, CUDA_R_16BF, ldc, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16BF, ldb, + strideB, A, CUDA_R_16BF, lda, strideA, &h_beta, C, CUDA_R_16BF, ldc, strideC, batchCount, CUBLAS_COMPUTE_32F, algo)); }); #else diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 754d7bd54b9f8..85423df3d3828 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1658,7 +1658,7 @@ def check_grad_with_place(self, for grad in analytic_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_analytic_grads.append(grad) analytic_grads = fp32_analytic_grads @@ -1666,7 +1666,7 @@ def check_grad_with_place(self, for grad in numeric_grads: if grad.dtype == np.uint16: grad = convert_uint16_to_float(grad) - max_relative_error = 0.03 if max_relative_error < 0.03 else max_relative_error + max_relative_error = 0.04 if max_relative_error < 0.04 else max_relative_error fp32_numeric_grads.append(grad) numeric_grads = fp32_numeric_grads diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index 83e60dcb322af..ed1495c6352bb 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -80,13 +80,12 @@ def setUp(self): else: x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) - # -0.1 ~ 0.1 - x = -0.1 + 0.2 * x - y = -0.1 + 0.2 * y + # -0.1 ~ 0.1 + x = -0.1 + 0.2 * x + y = -0.1 + 0.2 * y result = reference_matmul(x, y, self.trans_x, self.trans_y) - result = result.astype(self.dtype) if self.is_bfloat16_op(): - output = output.astype(np.float32) + result = result.astype(np.float32) self.inputs = { 'X': convert_float_to_uint16(x), 'Y': convert_float_to_uint16(y), @@ -96,6 +95,7 @@ def setUp(self): 'Y': y, } else: + result = result.astype(self.dtype) self.inputs = { 'X': x, 'Y': y, @@ -391,7 +391,7 @@ def get_numeric_grad(self, place, check_name): op = create_op(scope, self.op_type, self.inputs, self.outputs, self.attrs) return get_numeric_gradient(place, scope, op, self.inputs_fp32, - check_name, ['Output']) + check_name, ['Out']) def init_kernel_type(self): self.dtype = np.uint16 @@ -418,6 +418,9 @@ def test_check_grad_y(self): no_grad_set=set(['X']), user_defined_grads=[numeric_grads]) + def test_check_grad(self): + pass + cls_name = "{0}_{1}".format(parent.__name__, "Bf16") TestMatMulOpBf16Case.__name__ = cls_name globals()[cls_name] = TestMatMulOpBf16Case From cff9fdb644c77304aba6d58f2bfb8e0826bb9f16 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 11:42:36 +0800 Subject: [PATCH 08/10] fix rocm --- 
paddle/fluid/operators/math/blas_impl.hip.h | 40 ++++++++++++--------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index 86293e21f164b..5f262990e8208 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -577,12 +577,14 @@ inline void Blas::GEMM( float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( - handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, - ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, - rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -615,12 +617,14 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); + rocblas_gemm_algo algo = rocblas_gemm_algo_standard; context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::rocblas_gemm_ex( - handle, transb, transa, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, - ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, C, - rocblas_datatype_bf16_r, N, rocblas_datatype_f32_r, algo)); + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, + rocblas_datatype_bf16_r, ldb, A, rocblas_datatype_bf16_r, lda, &h_beta, + C, rocblas_datatype_bf16_r, N, C, rocblas_datatype_bf16_r, N, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -1037,8 +1041,9 @@ template <> template <> inline void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - T alpha, const T *A, const T *B, T beta, T *C, int batchCount, - int64_t strideA, int64_t strideB) const { + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -1056,10 +1061,11 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_gemm_strided_batched_ex( - handle, transb, transa, N, M, K, &h_alpha, B, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, - lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, - rocblas_datatype_f32_r, algo)); + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); }); } @@ -1067,8 +1073,9 @@ template <> template <> inline void Blas::BatchedGEMM( CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int M, int N, int K, - T alpha, const T *A, const T *B, T beta, T *C, int batchCount, - int64_t strideA, int64_t strideB) const { + platform::bfloat16 alpha, const platform::bfloat16 *A, + const platform::bfloat16 *B, platform::bfloat16 beta, platform::bfloat16 *C, + int batchCount, int64_t strideA, int64_t strideB) const { int lda = (transA == CblasNoTrans) ? 
K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -1086,10 +1093,11 @@ inline void Blas::BatchedGEMM( context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::rocblas_gemm_strided_batched_ex( - handle, transb, transa, N, M, K, &h_alpha, B, + handle, cuTransB, cuTransA, N, M, K, &h_alpha, B, rocblas_datatype_bf16_r, ldb, strideB, A, rocblas_datatype_bf16_r, - lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, - rocblas_datatype_f32_r, algo)); + lda, strideA, &h_beta, C, rocblas_datatype_bf16_r, ldc, strideC, C, + rocblas_datatype_bf16_r, ldc, strideC, batchCount, + rocblas_datatype_f32_r, algo, 0, 0)); }); } From 8c29718fea13a9966c0f714f83697353e66fd2df Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 15:35:11 +0800 Subject: [PATCH 09/10] add bf16 header file --- paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h index 24725d9741cc3..63897bd671740 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_device_function.h @@ -16,6 +16,7 @@ limitations under the License. */ // NOTE(): support float16 to half in header file. #define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" From ef6a4d38f61d7dffe17c0abd9bbcb2786bbf8405 Mon Sep 17 00:00:00 2001 From: zhiqiu Date: Tue, 15 Feb 2022 16:58:47 +0800 Subject: [PATCH 10/10] fix compile --- paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h index 5aff7fbb221c5..58a25ae8d0e56 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_device_function.h @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/core/enforce.h" namespace paddle { namespace platform { @@ -70,8 +71,8 @@ __forceinline__ __device__ bfloat16 CudaShuffleDownSync(unsigned mask, return bfloat16(__shfl_down_sync(mask, static_cast(val), static_cast(delta), width)); #else - PADDLE_THROW(platform::errors::Unimplemented( - "__shfl_down_sync with bfloat16 is not supported on cuda <= 11")); + PADDLE_ENFORCE( + false, "__shfl_down_sync with bfloat16 is not supported on cuda <= 11."); #endif }
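The bf16 unit tests added in this series store their inputs as np.uint16 arrays via convert_float_to_uint16 and keep a float32 copy (inputs_fp32) for the numeric gradient check. The sketch below shows the bit-level relationship between float32 and that uint16 bfloat16 storage; it is not part of the patch, the helper names are illustrative, and the real Paddle converter may round to nearest rather than truncate as assumed here. The precision lost in the low 16 mantissa bits is also why the bf16 cases use atol=0.01 and why op_test relaxes max_relative_error to 0.04.

# Illustrative sketch, not part of the patch: bfloat16 is the upper half of the
# IEEE-754 float32 bit pattern, which is why the tests can hold bf16 data in
# np.uint16 arrays. Truncation is assumed here; a production converter (such as
# Paddle's convert_float_to_uint16) may round to nearest even instead.
import numpy as np

def float32_to_bf16_bits(x):
    # Drop the low 16 mantissa bits, keeping sign, exponent, and high mantissa.
    x = np.asarray(x, dtype=np.float32)
    return (x.view(np.uint32) >> 16).astype(np.uint16)

def bf16_bits_to_float32(bits):
    # Re-expand to float32 with the dropped mantissa bits zeroed.
    bits = np.asarray(bits, dtype=np.uint16)
    return (bits.astype(np.uint32) << 16).view(np.float32)

x = np.array([0.1, 1.5, -3.25], dtype=np.float32)
print(bf16_bits_to_float32(float32_to_bf16_bits(x)))
# -> approximately [0.09960938, 1.5, -3.25]; the ~0.4% error on 0.1 is the kind
#    of gap the loosened atol / max_relative_error in the tests accommodate.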