conv2d support bfloat16 #32221

Merged
merged 16 commits into from
Jun 2, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 30 additions & 12 deletions paddle/fluid/operators/conv_cudnn_op.cu
@@ -51,6 +51,13 @@ template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
Contributor

Could this check be moved to a common place, for example into CudnnDataType<bfloat16>?

Contributor Author

CudnnDataType<bfloat16> can only perform a compile-time check. Instead, this was changed so that the bfloat16 kernels are simply not registered when the cuDNN version is below 8.1 (see the registration sketch after this file's diff).

    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -362,6 +369,13 @@ template <typename T>
class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -812,6 +826,13 @@ template <typename T>
class CUDNNConvDoubleGradOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if !CUDNN_VERSION_MIN(8, 1, 0)
    PADDLE_ENFORCE_EQ(
        std::is_same<T, platform::bfloat16>::value, false,
        platform::errors::Unavailable(
            "conv2d supports the bfloat16 data type, but it only works when "
            "the cuDNN version is 8.1.0 or newer."));
#endif
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
PADDLE_ENFORCE_EQ(
platform::is_gpu_place(ctx.GetPlace()), true,
@@ -1363,14 +1384,7 @@ REGISTER_OP_KERNEL(
conv2d_grad_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>);
// ROCM has a thread limit in depthwise_conv.cu which will result in accuracy issues
// Use depthwise_conv2d in MIOPEN to resolve this issue
REGISTER_OP_KERNEL(depthwise_conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(depthwise_conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);

REGISTER_OP_CUDA_KERNEL(
depthwise_conv2d_grad_grad,
paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
@@ -1389,22 +1403,26 @@ REGISTER_OP_KERNEL(
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvGradOpKernel<float>,
                   paddle::operators::CUDNNConvGradOpKernel<double>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvGradOpKernel<plat::bfloat16>);
REGISTER_OP_KERNEL(
    conv2d_grad_grad, CUDNN, plat::CUDAPlace,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);

REGISTER_OP_CUDA_KERNEL(
    depthwise_conv2d_grad_grad,
    paddle::operators::CUDNNConvDoubleGradOpKernel<float>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<double>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::float16>,
    paddle::operators::CUDNNConvDoubleGradOpKernel<plat::bfloat16>);

REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
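For context, a minimal sketch (not part of this diff) of the registration guard the author describes in the review thread above: register the bfloat16 kernels only when the build's cuDNN is 8.1.0 or newer, so older builds never expose a bf16 conv kernel at all.

// Sketch only: guard the bfloat16 kernel registration behind the cuDNN version.
#if CUDNN_VERSION_MIN(8, 1, 0)
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>,
                   paddle::operators::CUDNNConvOpKernel<plat::bfloat16>);
#else
REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>,
                   paddle::operators::CUDNNConvOpKernel<plat::float16>);
#endif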
5 changes: 5 additions & 0 deletions paddle/fluid/platform/cudnn_desc.h
@@ -79,6 +79,11 @@ inline cudnnDataType_t ToCudnnDataType(
case framework::proto::VarType::FP64:
type = CUDNN_DATA_DOUBLE;
break;
#if CUDNN_VERSION_MIN(8, 1, 0)
case framework::proto::VarType::BF16:
type = CUDNN_DATA_BFLOAT16;
break;
#endif
default:
break;
}
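As a small illustration (a sketch, not part of the diff), the new case means a BF16 variable type maps to the native cuDNN bfloat16 enum on cuDNN 8.1+ builds:

// On a cuDNN >= 8.1 build this yields CUDNN_DATA_BFLOAT16; on older builds the
// BF16 case is compiled out and the switch falls through to the default.
cudnnDataType_t t = platform::ToCudnnDataType(framework::proto::VarType::BF16);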
21 changes: 21 additions & 0 deletions paddle/fluid/platform/cudnn_helper.h
@@ -131,6 +131,27 @@ inline ActivationMode StringToActivationMode(const std::string& str) {
template <typename T>
class CudnnDataType;

template <>
class CudnnDataType<bfloat16> {
public:
// CUDNN_DATA_BFLOAT16 is not valid before cudnn8.1
#if CUDNN_VERSION_MIN(8, 1, 0)
static const cudnnDataType_t type = CUDNN_DATA_BFLOAT16;
#else
static const cudnnDataType_t type = CUDNN_DATA_HALF;
#endif
Contributor (@AshburnLee, Apr 23, 2021)

Is the #else branch really needed? When the cuDNN version is below 8.1 the whole class should not be compiled, so wouldn't it be better to wrap the entire class in #if / #endif?

Contributor Author

conv2d still has to compile successfully for bfloat16, and the code paths instantiate CudnnDataType<bfloat16>; wrapping the whole template specialization in the preprocessor guard leads to compilation failures.

Contributor

Below cuDNN 8.1 the half type should not be used as a fallback either; it should simply fail. Also, the checks added so far are all compile-time #if checks. Shouldn't a runtime check be added as well? The following can serve as a reference (it is also incomplete; a cuDNN version check still needs to be added):

if (input_data_type == framework::proto::VarType::FP16) {
  PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                    platform::errors::InvalidArgument(
                        "float16 can only be used when CUDNN is used"));
}
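A possible shape for that runtime check, as a sketch only: it assumes the check sits next to the quoted FP16 branch (for example in the conv op's kernel-type selection) and reuses its variable names; cudnnGetVersion() is the standard cuDNN runtime query, and 8100 encodes version 8.1.0.

// Sketch only: reject bfloat16 at runtime unless CUDNN is used and the loaded
// cuDNN library reports version 8.1.0 (encoded as 8100) or newer.
if (input_data_type == framework::proto::VarType::BF16) {
  PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
                    platform::errors::InvalidArgument(
                        "bfloat16 can only be used when CUDNN is used"));
  PADDLE_ENFORCE_GE(cudnnGetVersion(), 8100,
                    platform::errors::Unavailable(
                        "bfloat16 conv2d requires cuDNN 8.1.0 or newer"));
}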

using ScalingParamType = const float;
using BatchNormParamType = float;
static ScalingParamType* kOne() {
static ScalingParamType v = 1.0;
return &v;
}
static ScalingParamType* kZero() {
static ScalingParamType v = 0.0;
return &v;
}
};

template <>
class CudnnDataType<float16> {
public:
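For reference, a short sketch of how a CudnnDataType<T>::type trait is typically consumed when building a cuDNN tensor descriptor, which is why the bfloat16 specialization must exist for the code to compile at all. The helper function below is illustrative only and uses raw cuDNN calls with error handling omitted.

// Sketch: build a 4-D NCHW tensor descriptor whose element type comes from the
// trait. With T = platform::bfloat16 this selects CUDNN_DATA_BFLOAT16 on
// cuDNN 8.1+ builds.
template <typename T>
void DescribeNCHW(cudnnTensorDescriptor_t* desc, int n, int c, int h, int w) {
  cudnnCreateTensorDescriptor(desc);
  cudnnSetTensor4dDescriptor(*desc, CUDNN_TENSOR_NCHW,
                             platform::CudnnDataType<T>::type, n, c, h, w);
}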
40 changes: 40 additions & 0 deletions python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -167,6 +167,37 @@ def test_check_grad_no_input(self):
globals()[cls_name] = TestConv2DCUDNNFp16


def create_test_cudnn_bf16_class(parent, grad_check=True):
Contributor

Doesn't the conv test depend on the enhancements to the OpTest unit-test framework?

Contributor Author

It does. The latest code has been merged, and the OpTest framework changes are now synced in.

    @unittest.skipIf(
        not core.is_compiled_with_cuda() or core.cudnn_version() < 8100,
        "core is not compiled with CUDA or cudnn version is below 8.1.0")
    class TestConv2DCUDNNBF16(parent):
        def init_kernel_type(self):
            self.use_cudnn = True
            self.dtype = np.uint16

        def test_check_output(self):
            place = core.CUDAPlace(0)
            self.check_output_with_place(place, atol=1e-2)

        def test_check_grad_no_filter(self):
            place = core.CUDAPlace(0)
            if grad_check:
                self.check_grad_with_place(
                    place, ['Input'], 'Output', no_grad_set=set(['Filter']))

        def test_check_grad_no_input(self):
            place = core.CUDAPlace(0)
            if grad_check:
                self.check_grad_with_place(
                    place, ['Filter'], 'Output', no_grad_set=set(['Input']))

    cls_name = "{0}_{1}".format(parent.__name__, "CUDNNBF16")
    TestConv2DCUDNNBF16.__name__ = cls_name
    globals()[cls_name] = TestConv2DCUDNNBF16
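Since the bf16 test classes above set self.dtype = np.uint16, it helps to spell out the representation: these tests carry each bfloat16 value as the upper 16 bits of its float32 encoding. A minimal, self-contained sketch of that conversion (the helper name is illustrative, not the framework's own):

import numpy as np

def float32_to_bf16_uint16(x):
    # Keep only the upper 16 bits of each float32 (truncation, no rounding).
    x = np.ascontiguousarray(x, dtype=np.float32)
    return np.right_shift(x.view(np.uint32), 16).astype(np.uint16)

print(float32_to_bf16_uint16(np.array([1.0, -2.5, 3.140625])))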


def create_test_channel_last_class(parent):
class TestChannelLastCase(parent):
def init_data_format(self):
@@ -554,6 +585,15 @@ def init_group(self):
create_test_cudnn_fp16_class(TestWith1x1, grad_check=False)
create_test_cudnn_fp16_class(TestWithInput1x1Filter1x1, grad_check=False)

#----------------Conv2DCUDNN bf16----------------

create_test_cudnn_bf16_class(TestConv2DOp, grad_check=False)
create_test_cudnn_bf16_class(TestWithPad, grad_check=False)
create_test_cudnn_bf16_class(TestWithStride, grad_check=False)
create_test_cudnn_bf16_class(TestWithGroup, grad_check=False)
create_test_cudnn_bf16_class(TestWith1x1, grad_check=False)
create_test_cudnn_bf16_class(TestWithInput1x1Filter1x1, grad_check=False)
Contributor

None of these check gradients?

Contributor Author

The earlier version followed the CPU bf16 tests as a reference; the re-committed code now adds the backward (gradient) checks by default.


#----------------TestDepthwiseConv -----

