diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
index c80fda30990ff..91fe7dae50b54 100644
--- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc
@@ -34,25 +34,6 @@ class Scope;
 }  // namespace framework
 }  // namespace paddle
 
-namespace {
-
-template <typename Tin, typename Tout>
-void ConvertTensorType(phi::DenseTensor* tensor) {
-  phi::DenseTensor tmp_tensor;
-  tmp_tensor.set_type(phi::CppTypeToDataType<Tout>::Type());
-  tmp_tensor.Resize(tensor->dims());
-  auto* tmp_data = tmp_tensor.mutable_data<Tout>(paddle::platform::CPUPlace());
-  auto* data = tensor->mutable_data<Tin>(paddle::platform::CPUPlace());
-  for (int i = 0; i < tensor->numel(); i++) {
-    tmp_data[i] = static_cast<Tout>(data[i]);
-  }
-  tensor->clear();
-  paddle::framework::TensorCopySync(
-      tmp_tensor, paddle::platform::CPUPlace(), tensor);
-}
-
-}  // namespace
-
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -450,7 +431,7 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
   // conv_filter fp16 --> fp32
   auto tensor_type = filter_t->dtype();
   if (tensor_type == phi::DataType::FLOAT16) {
-    ConvertTensorType<phi::dtype::float16, float>(filter_t);
+    CastToFp32(filter_t, nullptr);
   }
   auto filter_dims = filter_t->dims();
   bool has_bias = with_bn || with_conv_bias;
@@ -529,9 +510,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph,
       }
     }
   }
-  if (tensor_type == phi::DataType::FLOAT16) {
-    ConvertTensorType<float, phi::dtype::float16>(filter_t);
-  }
   // filter max
   Node* filter_int16 = nullptr;
   Node* filter_max = nullptr;
diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc
index 9bde77ec8bf0b..4c3688b0badfa 100644
--- a/paddle/phi/backends/xpu/xpu2_op_list.cc
+++ b/paddle/phi/backends/xpu/xpu2_op_list.cc
@@ -164,7 +164,8 @@ XPUOpMap& get_kl2_ops() {
    {"conv3d",
     XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
    {"conv2d_transpose_grad", XPUKernelSet({phi::DataType::FLOAT32})},
-   {"conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32})},
+   {"conv2d_transpose",
+    XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
    {"cumsum",
     XPUKernelSet({phi::DataType::FLOAT32,
                   phi::DataType::FLOAT16,
diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc
index 9ff9bfc933a48..21b02412d31ca 100644
--- a/paddle/phi/kernels/reduce_mean_kernel.cc
+++ b/paddle/phi/kernels/reduce_mean_kernel.cc
@@ -67,5 +67,6 @@ PD_REGISTER_KERNEL(
 #endif
 
 #if defined(PADDLE_WITH_XPU)
-PD_REGISTER_KERNEL(mean, XPU, ALL_LAYOUT, phi::MeanKernel, float) {}
+PD_REGISTER_KERNEL(
+    mean, XPU, ALL_LAYOUT, phi::MeanKernel, float, phi::dtype::float16) {}
 #endif
diff --git a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc
index 41a1cceb11935..cc2c76c93a387 100644
--- a/paddle/phi/kernels/xpu/activation_kernel.cc
+++ b/paddle/phi/kernels/xpu/activation_kernel.cc
@@ -195,13 +195,6 @@ void PowKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const Scalar& factor,
                DenseTensor* out) {
-  // using XPUType = typename XPUTypeTrait<T>::Type;
-  // // dev_ctx.template Alloc<T>(out);
-  // auto pow_factor = factor.to<T>();
-  // const auto* x_data = reinterpret_cast<const XPUType*>(x.data<T>());
-  // auto* y_data = reinterpret_cast<XPUType*>(dev_ctx.template Alloc<T>(out));
-  // // const T* x_data = x.data<T>();
-  // // T* y_data = out->data<T>();
   dev_ctx.template Alloc<T>(out);
   float pow_factor = factor.to<float>();
   const T* x_data = x.data<T>();
@@ -578,16 +571,11 @@ PD_REGISTER_KERNEL(
 
 PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel)  // no grad
 PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hardswish, HardSwishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel)
 PD_REGISTER_ACTIVATION_KERNEL(pow, PowKernel)
 PD_REGISTER_ACTIVATION_KERNEL(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL(relu6_raw, Relu6RawKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(sqrt, SqrtKernel)
-// PD_REGISTER_ACTIVATION_KERNEL(swish_raw, SwishRawKernel)
 PD_REGISTER_ACTIVATION_KERNEL(softplus, SoftplusKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sin, SinKernel)
 PD_REGISTER_ACTIVATION_KERNEL(cos, CosKernel)
diff --git a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc
index dec56dab994fc..1b3c31f665c7c 100644
--- a/paddle/phi/kernels/xpu/conv_transpose_kernel.cc
+++ b/paddle/phi/kernels/xpu/conv_transpose_kernel.cc
@@ -123,11 +123,11 @@ void Conv2dTransposeKernel(const Context& ctx,
         true);
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_transpose_v2");
   } else {
-    int r = xpu::conv2d_transpose_v2<float>(
+    int r = xpu::conv2d_transpose_v2<XPUT>(
         ctx.x_context(),
-        x.data<float>(),
-        filter_.data<float>(),
-        out->data<float>(),
+        reinterpret_cast<const XPUT*>(x.data<T>()),
+        reinterpret_cast<const XPUT*>(filter.data<T>()),
+        reinterpret_cast<XPUT*>(out->data<T>()),
         batch_size,
         img_yc,
         img_xh,
@@ -148,5 +148,9 @@ void Conv2dTransposeKernel(const Context& ctx,
 
 }  // namespace phi
 
-PD_REGISTER_KERNEL(
-    conv2d_transpose, XPU, ALL_LAYOUT, phi::Conv2dTransposeKernel, float) {}
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::Conv2dTransposeKernel,
+                   float,
+                   phi::dtype::float16) {}
diff --git a/test/xpu/test_batch_norm_op_xpu.py b/test/xpu/test_batch_norm_op_xpu.py
index 6cf666c8094c9..c7a6efecad406 100644
--- a/test/xpu/test_batch_norm_op_xpu.py
+++ b/test/xpu/test_batch_norm_op_xpu.py
@@ -154,7 +154,6 @@ def __init__(self):
     class TestBatchNormOp(unittest.TestCase):
         def setUp(self):
             self.op_type = "batch_norm"
-            self.dtype = np.float32
             self.shape = [2, 3, 4, 5]
             self.data_layout = "NCHW"
             self.epsilon = 1e-05
@@ -162,6 +161,9 @@ def setUp(self):
             self.momentum = 0.9
             self.init_dtype()
             self.set_xpu()
             self.set_attrs()
+            self.rtol = 1e-5
+            if self.dtype == np.float16:
+                self.rtol = 1e-2
             if self.data_layout == "NHWC":
                 channel_size = self.shape[3]
@@ -175,15 +177,15 @@ def setUp(self):
             np.random.seed(1024)
             self.x_np = np.random.random_sample(self.shape).astype(self.dtype)
             self.scale_np = np.random.random_sample([channel_size]).astype(
-                self.dtype
+                np.float32
             )
             self.bias_np = np.random.random_sample([channel_size]).astype(
-                self.dtype
+                np.float32
             )
-            self.mean_np = np.zeros([channel_size]).astype(self.dtype)
-            self.variance_np = np.ones([channel_size]).astype(self.dtype)
-            self.saved_mean_np = np.zeros([channel_size]).astype(self.dtype)
-            self.saved_variance_np = np.ones([channel_size]).astype(self.dtype)
+            self.mean_np = np.zeros([channel_size]).astype(np.float32)
+            self.variance_np = np.ones([channel_size]).astype(np.float32)
+            self.saved_mean_np = np.zeros([channel_size]).astype(np.float32)
+            self.saved_variance_np = np.ones([channel_size]).astype(np.float32)
 
         def set_attrs(self):
             pass
@@ -244,7 +246,110 @@ def test_infer(self):
                 self.epsilon,
                 self.data_layout,
             )
-            np.testing.assert_allclose(y_np_ref, y_np, rtol=1e-05)
+            np.testing.assert_allclose(y_np_ref, y_np, rtol=self.rtol)
+
+    class TestBatchNormOpUseGlobalStats(unittest.TestCase):
+        def setUp(self):
+            self.places = [paddle.XPUPlace(0)]
+            self.init_test()
+
+        # train mode
+        def init_test(self):
+            self.use_global_stats = True
+            self.trainable_statistics = False
+
+        def test_global_stats(self):
+            for p in self.places:
+                with fluid.dygraph.guard(p):
+                    x = paddle.randn([2, 6, 6, 4])
+                    net1 = paddle.nn.BatchNorm(
+                        6,
+                        param_attr=fluid.ParamAttr(
+                            initializer=paddle.nn.initializer.Constant(1.0)
+                        ),
+                        use_global_stats=self.use_global_stats,
+                        trainable_statistics=self.trainable_statistics,
+                    )
+                    net2 = paddle.nn.BatchNorm2D(
+                        6, use_global_stats=self.use_global_stats
+                    )
+                    net2.weight = net1.weight
+                    net2.bias = net1.bias
+                    if self.trainable_statistics:
+                        net1.training = False
+                        net2.training = False
+                    y1 = net1(x)
+                    y2 = net2(x)
+                    np.testing.assert_allclose(
+                        y1.numpy(), y2.numpy(), rtol=1e-5
+                    )
+
+    class TestBatchNormOpUseGlobalStats1(TestBatchNormOpUseGlobalStats):
+        # test mode
+        def init_test(self):
+            self.use_global_stats = True
+            self.trainable_statistics = True
+
+    class TestBatchNormUseGlobalStats2(TestBatchNormOpUseGlobalStats):
+        # train mode
+        def init_test(self):
+            self.use_global_stats = True
+            self.trainable_statistics = False
+
+
+support_types = get_xpu_op_support_types('batch_norm')
+for stype in support_types:
+    create_test_class(globals(), XPUTestBatchNormOp, stype)
+
+
+class XPUTestBatchNormGradOp(XPUOpTestWrapper):
+    def __init__(self):
+        self.op_name = 'batch_norm'
+        self.use_dynamic_create_class = False
+
+    class TestBatchNormGradOp(unittest.TestCase):
+        def setUp(self):
+            self.op_type = "batch_norm"
+            self.shape = [2, 3, 4, 5]
+            self.data_layout = "NCHW"
+            self.epsilon = 1e-05
+            self.momentum = 0.9
+            self.init_dtype()
+            self.set_xpu()
+            self.set_attrs()
+
+            if self.data_layout == "NHWC":
+                channel_size = self.shape[3]
+            elif self.data_layout == "NCHW":
+                channel_size = self.shape[1]
+            else:
+                raise ValueError(
+                    "Unsupported data layout! Only NCHW and NHWC is supported, but received "
+                    + self.data_layout
+                )
+            np.random.seed(1024)
+            self.x_np = np.random.random_sample(self.shape).astype(self.dtype)
+            self.scale_np = np.random.random_sample([channel_size]).astype(
+                np.float32
+            )
+            self.bias_np = np.random.random_sample([channel_size]).astype(
+                np.float32
+            )
+            self.mean_np = np.zeros([channel_size]).astype(np.float32)
+            self.variance_np = np.ones([channel_size]).astype(np.float32)
+            self.saved_mean_np = np.zeros([channel_size]).astype(np.float32)
+            self.saved_variance_np = np.ones([channel_size]).astype(np.float32)
+
+        def set_attrs(self):
+            pass
+
+        def init_dtype(self):
+            self.dtype = self.in_type
+
+        def set_xpu(self):
+            self.__class__.use_xpu = True
+            self.__class__.op_type = self.in_type
+            self.place = paddle.XPUPlace(0)
 
         def test_train(self):
             y_grad_np = np.random.random_sample(self.shape).astype(self.dtype)
@@ -349,58 +454,10 @@ def test_train(self):
                     outputs[name], outs[id], rtol=1e-05, atol=1e-4
                 )
 
-    class TestBatchNormOpUseGlobalStats(unittest.TestCase):
-        def setUp(self):
-            self.places = [paddle.XPUPlace(0)]
-            self.init_test()
-
-        # train mode
-        def init_test(self):
-            self.use_global_stats = True
-            self.trainable_statistics = False
-
-        def test_global_stats(self):
-            for p in self.places:
-                with fluid.dygraph.guard(p):
-                    x = paddle.randn([2, 6, 6, 4])
-                    net1 = paddle.nn.BatchNorm(
-                        6,
-                        param_attr=fluid.ParamAttr(
-                            initializer=paddle.nn.initializer.Constant(1.0)
-                        ),
-                        use_global_stats=self.use_global_stats,
-                        trainable_statistics=self.trainable_statistics,
-                    )
-                    net2 = paddle.nn.BatchNorm2D(
-                        6, use_global_stats=self.use_global_stats
-                    )
-                    net2.weight = net1.weight
-                    net2.bias = net1.bias
-                    if self.trainable_statistics:
-                        net1.training = False
-                        net2.training = False
-                    y1 = net1(x)
-                    y2 = net2(x)
-                    np.testing.assert_allclose(
-                        y1.numpy(), y2.numpy(), rtol=1e-05
-                    )
-
-    class TestBatchNormOpUseGlobalStats1(TestBatchNormOpUseGlobalStats):
-        # test mode
-        def init_test(self):
-            self.use_global_stats = True
-            self.trainable_statistics = True
-
-    class TestBatchNormUseGlobalStats2(TestBatchNormOpUseGlobalStats):
-        # train mode
-        def init_test(self):
-            self.use_global_stats = True
-            self.trainable_statistics = False
-
-support_types = get_xpu_op_support_types('batch_norm')
-for stype in support_types:
-    create_test_class(globals(), XPUTestBatchNormOp, stype)
+support_types_grad = get_xpu_op_support_types('batch_norm_grad')
+for stype_grad in support_types_grad:
+    create_test_class(globals(), XPUTestBatchNormGradOp, stype_grad)
 
 
 if __name__ == '__main__':
     unittest.main()