conv2d support bfloat16 #32221

Merged
merged 16 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions paddle/fluid/operators/conv_cudnn_op.cu
@@ -51,6 +51,13 @@ template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
Contributor

Could this check be moved to a common place, for example into CudnnDataType<bfloat16>?

Contributor Author

CudnnDataType<bfloat16> can only perform a compile-time check. Instead, this was changed so that the bfloat16 kernels are simply not registered when the cuDNN version is below 8.1 (see the registration sketch after this file's diff).

    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -362,6 +369,13 @@ template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -812,6 +826,13 @@ template <typename T>
class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -1363,14 +1384,7 @@ REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
// ROCM has a thread limit in depthwise_conv.cu which will result in accuracy issues
// Use depthwise_conv2d in MIOPEN to resolve this issue
REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);

REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
@@ -1389,22 +1403,26 @@ REGISTER_OP_KERNEL(
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
                   paddle::operators::CUDNNConvGradOpKernel<double>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(
    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);

REGISTER_OP_CUDA_KERNEL(
    depthwise_conv2d_grad_grad,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);

REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
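For context, a minimal sketch (not part of this diff) of the registration guard the author describes in the review thread above: register the bfloat16 kernels only when the build's cuDNN is 8.1.0 or newer, so older builds never expose a bf16 conv kernel at all.

// Sketch only: guard the bfloat16 kernel registration behind the cuDNN version.
#if CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
#else
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
#endif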
5 changes: 5 additions & 0 deletions paddle/fluid/platform/cudnn_desc.h
@@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType(
case framework::proto::VarType::FP64:
type = CUDNN_DATA_DOUBLE;
break;
#if CUDNN_VERSION_MIN(8, 1, 0)
case framework::proto::VarType::BF16:
type = CUDNN_DATA_BFLOAT16;
break;
#endif
default:
break;
}
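As a small illustration (a sketch, not part of the diff), the new case means a BF16 variable type maps to the native cuDNN bfloat16 enum on cuDNN 8.1+ builds:

// On a cuDNN >= 8.1 build this yields CUDNN_DATA_BFLOAT16; on older builds the
// BF16 case is compiled out and the switch falls through to the default.
cudnnDataType_t t = platform::ToCudnnDataType(framework::proto::VarType::BF16);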
21 changes: 21 additions & 0 deletions paddle/fluid/platform/cudnn_helper.h
@@ -131,6 +131,27 @@ inline ActivationMode StringToActivationMode(const std::string& str) {
template <typename T>
class CudnnDataType;

template <>
class CudnnDataType<bfloat16> {
public:
// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
#if CUDNN_VERSION_MIN(8, 1, 0)
static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16;
#else
static const cudnnDataType_t type = CUDNN_DATA_HALF;
#endif
Contributor (@AshburnLee, Apr 23, 2021)

Is the #else branch really needed? When the cuDNN version is below 8.1 the whole class should not be compiled, so wouldn't it be better to wrap the entire class in #if / #endif?

Contributor Author

conv2d still has to compile successfully for bfloat16, and the code paths instantiate CudnnDataType<bfloat16>; wrapping the whole template specialization in the preprocessor guard leads to compilation failures.

Contributor

Below cuDNN 8.1 the half type should not be used as a fallback either; it should simply fail. Also, the checks added so far are all compile-time #if checks. Shouldn't a runtime check be added as well? The following can serve as a reference (it is also incomplete; a cuDNN version check still needs to be added):

if (input_data_type == framework::proto::VarType::FP16) {
  PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                    platform::errors::InvalidArgument(
                        "float16 can only be used when CUDNN is used"));
}
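A possible shape for that runtime check, as a sketch only: it assumes the check sits next to the quoted FP16 branch (for example in the conv op's kernel-type selection) and reuses its variable names; cudnnGetVersion() is the standard cuDNN runtime query, and 8100 encodes version 8.1.0.

// Sketch only: reject bfloat16 at runtime unless CUDNN is used and the loaded
// cuDNN library reports version 8.1.0 (encoded as 8100) or newer.
if (input_data_type == framework::proto::VarType::BF16) {
  PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                    platform::errors::InvalidArgument(
                        "bfloat16 can only be used when CUDNN is used"));
  PADDLE_ENFORCE_GE(cudnnGetVersion(), 8100,
                    platform::errors::Unavailable(
                        "bfloat16 conv2d requires cuDNN 8.1.0 or newer"));
}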

using ScalingParamType = const float;
using BatchNormParamType = float;
static ScalingParamType* kOne() {
static ScalingParamType v = 1.0;
return &v;
}
static ScalingParamType* kZero() {
static ScalingParamType v = 0.0;
return &v;
}
};

template <>
class CudnnDataType<float16> {
public:
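For reference, a short sketch of how a CudnnDataType<T>::type trait is typically consumed when building a cuDNN tensor descriptor, which is why the bfloat16 specialization must exist for the code to compile at all. The helper function below is illustrative only and uses raw cuDNN calls with error handling omitted.

// Sketch: build a 4-D NCHW tensor descriptor whose element type comes from the
// trait. With T = platform::bfloat16 this selects CUDNN_DATA_BFLOAT16 on
// cuDNN 8.1+ builds.
template <typename T>
void DescribeNCHW(cudnnTensorDescriptor_t* desc, int n, int c, int h, int w) {
  cudnnCreateTensorDescriptor(desc);
  cudnnSetTensor4dDescriptor(*desc, CUDNN_TENSOR_NCHW,
                             platform::CudnnDataType<T>::type, n, c, h, w);
}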
40 changes: 40 additions & 0 deletions python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -167,6 +167,37 @@ def test_check_grad_no_input(self):
globals()[cls_name] = TestConv2DCUDNNFp16


def create_test_cudnn_bf16_class(parent, grad_check=True):
Contributor

Doesn't the conv test depend on the enhancements to the OpTest unit-test framework?

Contributor Author

It does. The latest code has been merged, and the OpTest framework changes are now synced in.

    @unittest.skipIf(
        not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
        "core is not compiled with CUDA or cudnn version is below 8.1.0")
    class TestConv2DCUDNNBF16(parent):
        def init_kernel_type(self):
            self.use_cudnn = True
            self.dtype = np.uint16

        def test_check_output(self):
            place = core.CUDAPlace(0)
            self.check_output_with_place(place, atol=1e-2)

        def test_check_grad_no_filter(self):
            place = core.CUDAPlace(0)
            if grad_check:
                self.check_grad_with_place(
                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))

        def test_check_grad_no_input(self):
            place = core.CUDAPlace(0)
            if grad_check:
                self.check_grad_with_place(
                    place, ['Filter'], 'Output', no_grad_set=set(['Input']))

    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16")
    TestConv2DCUDNNBF16.__name__ = cls_name
    globals()[cls_name] = TestConv2DCUDNNBF16
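Since the bf16 test classes above set self.dtype = np.uint16, it helps to spell out the representation: these tests carry each bfloat16 value as the upper 16 bits of its float32 encoding. A minimal, self-contained sketch of that conversion (the helper name is illustrative, not the framework's own):

import numpy as np

def float32_to_bf16_uint16(x):
    # Keep only the upper 16 bits of each float32 (truncation, no rounding).
    x = np.ascontiguousarray(x, dtype=np.float32)
    return np.right_shift(x.view(np.uint32), 16).astype(np.uint16)

print(float32_to_bf16_uint16(np.array([1.0, -2.5, 3.140625])))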


def create_test_channel_last_class(parent):
class TestChannelLastCase(parent):
def init_data_format(self):
@@ -554,6 +585,15 @@ def init_group(self):
create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)

#----------------Conv2DCUDNN bf16----------------

create_test_cudnn_bf16_class(TestConv2DOp, grad_check=False)
create_test_cudnn_bf16_class(TestWithPad, grad_check=False)
create_test_cudnn_bf16_class(TestWithStride, grad_check=False)
create_test_cudnn_bf16_class(TestWithGroup, grad_check=False)
create_test_cudnn_bf16_class(TestWith1x1, grad_check=False)
create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1, grad_check=False)
Contributor

None of these check gradients?

Contributor Author

The earlier version followed the CPU bf16 tests as a reference; the re-committed code now adds the backward (gradient) checks by default.


#----------------TestDepthwiseConv -----

