From c8becdd9326d841e2bf69413d137d44cb3bbbb7a Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Sun, 12 Mar 2023 21:33:32 +0800 Subject: [PATCH 01/10] add fp16 and bfp16 for temporalshift --- .../kernels/gpu/temporal_shift_grad_kernel.cu | 4 +- .../phi/kernels/gpu/temporal_shift_kernel.cu | 4 +- .../phi/kernels/temporal_shift_grad_kernel.h | 1 + paddle/phi/kernels/temporal_shift_kernel.h | 1 + .../tests/unittests/test_temporal_shift_op.py | 96 +++++++++++++++++-- 5 files changed, 96 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu index cc5d95a12f7a3..ec20e0b523a13 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -146,4 +147,5 @@ PD_REGISTER_KERNEL(temporal_shift_grad, phi::TemporalShiftGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu index b321fad07ac1f..d83713f064f0a 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/temporal_shift_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" @@ -146,4 +147,5 @@ PD_REGISTER_KERNEL(temporal_shift, phi::TemporalShiftKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/temporal_shift_grad_kernel.h 
b/paddle/phi/kernels/temporal_shift_grad_kernel.h index 1bcd3d61c26f5..e91d08045ab88 100644 --- a/paddle/phi/kernels/temporal_shift_grad_kernel.h +++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/paddle/phi/kernels/temporal_shift_kernel.h b/paddle/phi/kernels/temporal_shift_kernel.h index a927d7fb23aae..7c85ffd9783aa 100644 --- a/paddle/phi/kernels/temporal_shift_kernel.h +++ b/paddle/phi/kernels/temporal_shift_kernel.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" namespace phi { diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 6b99e0ead0886..30c07b1ca5464 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle from paddle.fluid import core @@ -103,25 +103,105 @@ def initTestCase(self): @unittest.skipIf( - not core.is_compiled_with_cuda(), "core is not compiled with CUDA" + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", ) -class TestTemporalShiftFP16(TestTemporalShift): +class TestTemporalShiftFP16OP(OpTest): def initTestCase(self): self.x_shape = (3, 10, 5, 5) self.seg_num = 1 self.shift_ratio = 0.3 - self.dtype = 'float16' + self.dtype = np.float16 self.data_format = 'NCHW' + def setUp(self): + self.initTestCase() + self.op_type = 'temporal_shift' + self.python_api = paddle.nn.functional.temporal_shift + self.__class__.op_type = self.op_type + x = np.random.random(self.x_shape).astype(np.float32) + + self.attrs = { 
+ "seg_num": self.seg_num, + "shift_ratio": self.shift_ratio, + "data_format": self.data_format, + } + + self.inputs = { + "X": x.astype(self.dtype), + } + + output = temporal_shift( + x, self.seg_num, self.shift_ratio, self.data_format + ) + self.outputs = {"Out": output} + self.python_out_sig = ["Out"] + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_eager=True, atol=1e-3) + + def test_check_grad_ignore_uv(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + check_eager=True, + max_relative_error=1e-2, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class TestTemporalShiftBF16(OpTest): + def initTestCase(self): + self.x_shape = (3, 10, 5, 5) + self.seg_num = 1 + self.shift_ratio = 0.3 + self.dtype = np.uint16 + self.data_format = 'NCHW' + + def setUp(self): + self.initTestCase() + self.op_type = 'temporal_shift' + self.python_api = paddle.nn.functional.temporal_shift + self.__class__.op_type = self.op_type + x = np.random.random(self.x_shape).astype(np.float32) + + self.attrs = { + "seg_num": self.seg_num, + "shift_ratio": self.shift_ratio, + "data_format": self.data_format, + } + + self.inputs = { + "X": convert_float_to_uint16(x), + } + + output = temporal_shift( + x, self.seg_num, self.shift_ratio, self.data_format + ) + self.outputs = {"Out": convert_float_to_uint16(output)} + self.python_out_sig = ["Out"] + def test_check_output(self): place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_output_with_place(place) + self.check_output_with_place(place, check_eager=True, atol=1e-3) def test_check_grad_ignore_uv(self): place = core.CUDAPlace(0) - if core.is_float16_supported(place): - self.check_grad_with_place(place, ['X'], 'Out') + self.check_grad_with_place( + place, + ['X'], + 'Out', + check_eager=True, + 
max_relative_error=1e-2, + ) class TestTemporalShiftAPI(unittest.TestCase): From 2207120533816d8697fb22f9ca25e0aa641f82d6 Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Mon, 13 Mar 2023 12:23:23 +0800 Subject: [PATCH 02/10] add fp16 and bfp16 for complex --- paddle/phi/kernels/complex_grad_kernel.h | 1 + paddle/phi/kernels/complex_kernel.h | 1 + paddle/phi/kernels/gpu/complex_grad_kernel.cu | 12 ++- paddle/phi/kernels/gpu/complex_kernel.cu | 12 ++- .../fluid/tests/unittests/test_complex_op.py | 100 +++++++++++++++++- .../tests/unittests/test_temporal_shift_op.py | 4 +- 6 files changed, 120 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 91c47538e958d..393b39e562617 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index ad66b890b3d5a..47639ed430427 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu index b2a6e4117c075..0b722591bf3a0 100644 --- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/kernels/complex_grad_kernel.h" - #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" @@ -36,7 +36,13 @@ PD_REGISTER_KERNEL(real_grad, kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } -PD_REGISTER_KERNEL( - complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { +PD_REGISTER_KERNEL(complex_grad, + GPU, + ALL_LAYOUT, + phi::ComplexGradKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 5c5bf104128d3..4f6c609b8a44c 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/complex_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_kernel_impl.h" @@ -51,7 +51,13 @@ PD_REGISTER_KERNEL(imag, kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } -PD_REGISTER_KERNEL( - complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { +PD_REGISTER_KERNEL(complex, + GPU, + ALL_LAYOUT, + phi::ComplexKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) { kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index c769a85569820..8ed31fe99b6ac 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from op_test 
import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle from paddle import static -from paddle.fluid import dygraph +from paddle.fluid import core, dygraph paddle.enable_static() @@ -162,5 +162,101 @@ def test_static(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class TestComplexFP16Op(OpTest): + def init_spec(self): + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.dtype = np.float16 + + def setUp(self): + self.op_type = "complex" + self.python_api = paddle.complex + self.init_spec() + self.__class__.op_type = self.op_type + x = np.random.randn(*self.x_shape).astype(np.float32) + y = np.random.randn(*self.y_shape).astype(np.float32) + out_ref = ref_complex(x, y).astype(np.float64) + self.out_grad = np.random.randn(*self.x_shape).astype( + np.float64 + ) + 1j * np.random.randn(*self.y_shape).astype(np.float64) + self.inputs = {'X': x.astype(self.dtype), 'Y': y.astype(self.dtype)} + self.outputs = {'Out': out_ref} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_eager=True, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + dout = self.out_grad + dx, dy = ref_complex_grad( + self.inputs['X'], self.inputs['Y'], self.out_grad + ) + self.check_grad( + place, + ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy], + user_defined_grad_outputs=[dout], + check_eager=True, + max_relative_error=1e-2, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestComplexBF16(OpTest): + def init_spec(self): + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.dtype = np.uint16 + + def setUp(self): + self.op_type = "complex" + self.python_api 
= paddle.complex + self.init_spec() + self.__class__.op_type = self.op_type + x = np.random.randn(*self.x_shape).astype(np.float32) + y = np.random.randn(*self.y_shape).astype(np.float32) + out_ref = ref_complex(x, y).astype(np.float64) + self.out_grad = convert_float_to_uint16( + np.random.randn(*self.x_shape).astype(np.float64) + + 1j * np.random.randn(*self.y_shape).astype(np.float64) + ) + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y), + } + self.outputs = {'Out': convert_float_to_uint16(out_ref)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, check_eager=True, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + dout = self.out_grad + dx, dy = ref_complex_grad( + self.inputs['X'], self.inputs['Y'], self.out_grad + ) + self.check_grad( + place, + ['X', 'Y'], + 'Out', + user_defined_grads=[dx, dy], + user_defined_grad_outputs=[dout], + check_eager=True, + max_relative_error=1e-2, + ) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 30c07b1ca5464..62591655a4508 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -155,8 +155,8 @@ def test_check_grad_ignore_uv(self): @unittest.skipIf( not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", ) class TestTemporalShiftBF16(OpTest): def initTestCase(self): From 95edb7e7e5780ccaf2e228f95b20b978dcce1431 Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Mon, 13 Mar 2023 21:29:39 +0800 Subject: [PATCH 03/10] fix bug --- 
paddle/phi/kernels/complex_grad_kernel.h | 1 - paddle/phi/kernels/complex_kernel.h | 2 +- paddle/phi/kernels/gpu/complex_grad_kernel.cu | 11 +- paddle/phi/kernels/gpu/complex_kernel.cu | 11 +- .../fluid/tests/unittests/test_complex_op.py | 100 +----------------- 5 files changed, 7 insertions(+), 118 deletions(-) diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h index 393b39e562617..91c47538e958d 100644 --- a/paddle/phi/kernels/complex_grad_kernel.h +++ b/paddle/phi/kernels/complex_grad_kernel.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 47639ed430427..ac33e32826daa 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/phi/common/complex.h" -#include "paddle/phi/common/data_type.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu index 0b722591bf3a0..cb9f527bcae13 100644 --- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/complex_grad_kernel.h" #include "paddle/phi/common/complex.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" @@ -36,13 +35,7 @@ PD_REGISTER_KERNEL(real_grad, kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } -PD_REGISTER_KERNEL(complex_grad, - GPU, - ALL_LAYOUT, - phi::ComplexGradKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) { 
+PD_REGISTER_KERNEL( + complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) { kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 4f6c609b8a44c..2ea178f102297 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -14,7 +14,6 @@ #include "paddle/phi/kernels/complex_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_kernel_impl.h" @@ -51,13 +50,7 @@ PD_REGISTER_KERNEL(imag, kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } -PD_REGISTER_KERNEL(complex, - GPU, - ALL_LAYOUT, - phi::ComplexKernel, - float, - double, - phi::dtype::float16, - phi::dtype::bfloat16) { +PD_REGISTER_KERNEL( + complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) { kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index 8ed31fe99b6ac..c769a85569820 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -15,11 +15,11 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest import paddle from paddle import static -from paddle.fluid import core, dygraph +from paddle.fluid import dygraph paddle.enable_static() @@ -162,101 +162,5 @@ def test_static(self): np.testing.assert_allclose(self.out, out_np, rtol=1e-05) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestComplexFP16Op(OpTest): - def init_spec(self): - self.x_shape = 
[10, 10] - self.y_shape = [10, 10] - self.dtype = np.float16 - - def setUp(self): - self.op_type = "complex" - self.python_api = paddle.complex - self.init_spec() - self.__class__.op_type = self.op_type - x = np.random.randn(*self.x_shape).astype(np.float32) - y = np.random.randn(*self.y_shape).astype(np.float32) - out_ref = ref_complex(x, y).astype(np.float64) - self.out_grad = np.random.randn(*self.x_shape).astype( - np.float64 - ) + 1j * np.random.randn(*self.y_shape).astype(np.float64) - self.inputs = {'X': x.astype(self.dtype), 'Y': y.astype(self.dtype)} - self.outputs = {'Out': out_ref} - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=True, atol=1e-3) - - def test_check_grad(self): - place = core.CUDAPlace(0) - dout = self.out_grad - dx, dy = ref_complex_grad( - self.inputs['X'], self.inputs['Y'], self.out_grad - ) - self.check_grad( - place, - ['X', 'Y'], - 'Out', - user_defined_grads=[dx, dy], - user_defined_grad_outputs=[dout], - check_eager=True, - max_relative_error=1e-2, - ) - - -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the bfloat16", -) -class TestComplexBF16(OpTest): - def init_spec(self): - self.x_shape = [10, 10] - self.y_shape = [10, 10] - self.dtype = np.uint16 - - def setUp(self): - self.op_type = "complex" - self.python_api = paddle.complex - self.init_spec() - self.__class__.op_type = self.op_type - x = np.random.randn(*self.x_shape).astype(np.float32) - y = np.random.randn(*self.y_shape).astype(np.float32) - out_ref = ref_complex(x, y).astype(np.float64) - self.out_grad = convert_float_to_uint16( - np.random.randn(*self.x_shape).astype(np.float64) - + 1j * np.random.randn(*self.y_shape).astype(np.float64) - ) - self.inputs = { - 'X': convert_float_to_uint16(x), - 'Y': convert_float_to_uint16(y), - } - self.outputs = {'Out': convert_float_to_uint16(out_ref)} - - def 
test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=True, atol=1e-3) - - def test_check_grad(self): - place = core.CUDAPlace(0) - dout = self.out_grad - dx, dy = ref_complex_grad( - self.inputs['X'], self.inputs['Y'], self.out_grad - ) - self.check_grad( - place, - ['X', 'Y'], - 'Out', - user_defined_grads=[dx, dy], - user_defined_grad_outputs=[dout], - check_eager=True, - max_relative_error=1e-2, - ) - - if __name__ == "__main__": unittest.main() From 7084a8b318589684c401db04e7f34c1a3b698f82 Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Mon, 13 Mar 2023 21:32:16 +0800 Subject: [PATCH 04/10] fix bug --- paddle/phi/kernels/complex_kernel.h | 1 - paddle/phi/kernels/gpu/complex_grad_kernel.cu | 1 + paddle/phi/kernels/gpu/complex_kernel.cu | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index ac33e32826daa..ad66b890b3d5a 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -15,7 +15,6 @@ limitations under the License. */ #pragma once #include "paddle/phi/common/complex.h" - #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu index cb9f527bcae13..b2a6e4117c075 100644 --- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/complex_grad_kernel.h" + #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h" diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 2ea178f102297..5c5bf104128d3 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/complex_kernel.h" + #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_kernel_impl.h" From 1147ebe90d0041c7eda7b053304f612f64d30cfd Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Mon, 13 Mar 2023 22:54:55 +0800 Subject: [PATCH 05/10] add fp16 and bf16 for conj --- paddle/phi/kernels/complex_kernel.h | 1 + paddle/phi/kernels/gpu/complex_kernel.cu | 3 +- .../fluid/tests/unittests/test_complex_op.py | 299 +++++++++++------- 3 files changed, 186 insertions(+), 117 deletions(-) diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index ad66b890b3d5a..5c9cf9c408237 100644 --- a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 5c5bf104128d3..9ebf56c2a5feb 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include "paddle/phi/kernels/complex_kernel.h" - #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_kernel_impl.h" @@ -26,6 +26,7 @@ PD_REGISTER_KERNEL(conj, ALL_LAYOUT, phi::ConjKernel, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex<float>, phi::dtype::complex<double>, float, diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index c769a85569820..1c66e8f16fbdb 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,154 +12,221 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import sys import unittest import numpy as np -from op_test import OpTest import paddle -from paddle import static -from paddle.fluid import dygraph -paddle.enable_static() - - -def ref_complex(x, y): - return x + 1j * y - - -def ref_complex_grad(x, y, dout): - out = x + 1j * y - out_rank = out.ndim - delta_rank_x = out_rank - x.ndim - delta_rank_y = out_rank - y.ndim +sys.path.append("..") +from numpy.random import random as rand +from op_test import OpTest, convert_float_to_uint16 - dx_reduce_axes = [] - dy_reduce_axes = [] - - for i in range(out_rank): - if i < delta_rank_x or dout.shape[i] > x.shape[i - delta_rank_x]: - dx_reduce_axes.append(i) - if i < delta_rank_y or dout.shape[i] > y.shape[i - delta_rank_y]: - dy_reduce_axes.append(i) - dx = np.sum(dout.real, axis=tuple(dx_reduce_axes)).reshape(x.shape) - dy = np.sum(dout.imag, axis=tuple(dy_reduce_axes)).reshape(y.shape) - return (dx, dy) +import paddle.fluid.core as core +import paddle.fluid.dygraph as dg +import paddle.static as static +paddle.enable_static() -class TestComplexOp(OpTest): - def init_spec(self): - self.x_shape = [10, 10] - self.y_shape = [10, 10] - self.dtype = "float64" +class TestConjOp(OpTest): def setUp(self): - self.op_type = "complex" - self.python_api = paddle.complex - self.init_spec() - x = np.random.randn(*self.x_shape).astype(self.dtype) - y = np.random.randn(*self.y_shape).astype(self.dtype) - out_ref = ref_complex(x, y) - self.out_grad = np.random.randn(*self.x_shape).astype( + self.op_type = "conj" + self.python_api = paddle.tensor.conj + self.init_dtype_type() + self.init_input_output() + self.init_grad_input_output() + + def init_dtype_type(self): + self.dtype = np.complex64 + + def init_input_output(self): + x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype(self.dtype) + out = np.conj(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def init_grad_input_output(self): + self.grad_out = 
(np.ones((12, 14)) + 1j * np.ones((12, 14))).astype( self.dtype - ) + 1j * np.random.randn(*self.y_shape).astype(self.dtype) - self.inputs = {'X': x, 'Y': y} - self.outputs = {'Out': out_ref} + ) + self.grad_in = np.conj(self.grad_out) def test_check_output(self): self.check_output(check_eager=True) - def test_check_grad(self): - dout = self.out_grad - dx, dy = ref_complex_grad( - self.inputs['X'], self.inputs['Y'], self.out_grad - ) + def test_check_grad_normal(self): self.check_grad( - ['X', 'Y'], + ['X'], 'Out', - user_defined_grads=[dx, dy], - user_defined_grad_outputs=[dout], + user_defined_grads=[self.grad_in], + user_defined_grad_outputs=[self.grad_out], check_eager=True, ) - def test_check_grad_ignore_x(self): - dout = self.out_grad - dx, dy = ref_complex_grad( - self.inputs['X'], self.inputs['Y'], self.out_grad - ) - self.assertTupleEqual(dx.shape, tuple(self.x_shape)) - self.assertTupleEqual(dy.shape, tuple(self.y_shape)) - self.check_grad( - ['Y'], - 'Out', - no_grad_set=set('X'), - user_defined_grads=[dy], - user_defined_grad_outputs=[dout], - check_eager=True, - ) - def test_check_grad_ignore_y(self): - dout = self.out_grad - dx, dy = ref_complex_grad( - self.inputs['X'], self.inputs['Y'], self.out_grad - ) - self.check_grad( +class TestComplexConjOp(unittest.TestCase): + def setUp(self): + self._dtypes = ["float32", "float64"] + self._places = [paddle.CPUPlace()] + if paddle.is_compiled_with_cuda(): + self._places.append(paddle.CUDAPlace(0)) + + def test_conj_api(self): + for dtype in self._dtypes: + input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( + [2, 20, 2, 3] + ).astype(dtype) + for place in self._places: + with dg.guard(place): + var_x = paddle.to_tensor(input) + result = paddle.conj(var_x).numpy() + target = np.conj(input) + np.testing.assert_array_equal(result, target) + + def test_conj_operator(self): + for dtype in self._dtypes: + input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( + [2, 20, 2, 3] + ).astype(dtype) + for place in 
self._places: + with dg.guard(place): + var_x = paddle.to_tensor(input) + result = var_x.conj().numpy() + target = np.conj(input) + np.testing.assert_array_equal(result, target) + + def test_conj_static_mode(self): + def init_input_output(dtype): + input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( + [2, 20, 2, 3] + ).astype(dtype) + return {'x': input}, np.conj(input) + + for dtype in self._dtypes: + input_dict, np_res = init_input_output(dtype) + for place in self._places: + with static.program_guard(static.Program()): + x_dtype = ( + np.complex64 if dtype == "float32" else np.complex128 + ) + x = static.data( + name="x", shape=[2, 20, 2, 3], dtype=x_dtype + ) + out = paddle.conj(x) + + exe = static.Executor(place) + out_value = exe.run(feed=input_dict, fetch_list=[out.name]) + np.testing.assert_array_equal(np_res, out_value[0]) + + def test_conj_api_real_number(self): + for dtype in self._dtypes: + input = rand([2, 20, 2, 3]).astype(dtype) + for place in self._places: + with dg.guard(place): + var_x = paddle.to_tensor(input) + result = paddle.conj(var_x).numpy() + target = np.conj(input) + np.testing.assert_array_equal(result, target) + + +class Testfp16ConjOp(unittest.TestCase): + def testfp16(self): + input_x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype('float16') + with static.program_guard(static.Program()): + x = static.data(name="x", shape=[12, 14], dtype='float16') + out = paddle.conj(x) + if paddle.is_compiled_with_cuda(): + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + exe.run(paddle.static.default_startup_program()) + out = exe.run(feed={'x': input_x}, fetch_list=[out]) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class TestConjFP16OP(OpTest): + def setUp(self): + self.op_type = "conj" + self.python_api = paddle.tensor.conj + self.__class__.op_type = 
self.op_type + self.init_dtype_type() + self.init_input_output() + # self.init_grad_input_output() + + def init_dtype_type(self): + self.dtype = np.float16 + + def init_input_output(self): + x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype(np.float32) + out = np.conj(x) + + self.inputs = {'X': x.astype(self.dtype)} + self.outputs = {'Out': out} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', - no_grad_set=set('Y'), - user_defined_grads=[dx], - user_defined_grad_outputs=[dout], - check_eager=True, + max_relative_error=1e-2, ) -class TestComplexOpBroadcast1(TestComplexOp): - def init_spec(self): - self.x_shape = [10, 3, 1, 4] - self.y_shape = [100, 1] - self.dtype = "float64" - +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestConjBF16(OpTest): + def setUp(self): + self.op_type = "conj" + self.python_api = paddle.tensor.conj + self.__class__.op_type = self.op_type + self.init_dtype_type() + self.init_input_output() + # self.init_grad_input_output() -class TestComplexOpBroadcast2(TestComplexOp): - def init_spec(self): - self.x_shape = [100, 1] - self.y_shape = [10, 3, 1, 4] - self.dtype = "float32" + def init_dtype_type(self): + self.dtype = np.uint16 + def init_input_output(self): + x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype(np.float32) + out = np.conj(x) -class TestComplexOpBroadcast3(TestComplexOp): - def init_spec(self): - self.x_shape = [1, 100] - self.y_shape = [100] - self.dtype = "float32" + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + def test_check_output(self): + place = core.CUDAPlace(0) + 
self.check_output_with_place(place, atol=1e-3) -class TestComplexAPI(unittest.TestCase): - def setUp(self): - self.x = np.random.randn(10, 10) - self.y = np.random.randn(10, 10) - self.out = ref_complex(self.x, self.y) - - def test_dygraph(self): - with dygraph.guard(): - x = paddle.to_tensor(self.x) - y = paddle.to_tensor(self.y) - out_np = paddle.complex(x, y).numpy() - np.testing.assert_allclose(self.out, out_np, rtol=1e-05) - - def test_static(self): - mp, sp = static.Program(), static.Program() - with static.program_guard(mp, sp): - x = static.data("x", shape=[10, 10], dtype="float64") - y = static.data("y", shape=[10, 10], dtype="float64") - out = paddle.complex(x, y) - - exe = static.Executor() - exe.run(sp) - [out_np] = exe.run( - mp, feed={"x": self.x, "y": self.y}, fetch_list=[out] + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + max_relative_error=1e-2, ) - np.testing.assert_allclose(self.out, out_np, rtol=1e-05) if __name__ == "__main__": From 04083a5fac068750e64656a6e2396becb58c51a9 Mon Sep 17 00:00:00 2001 From: loneranger <836253168@qq.com> Date: Mon, 13 Mar 2023 23:08:21 +0800 Subject: [PATCH 06/10] fix bug --- .../fluid/tests/unittests/test_complex_op.py | 299 +++++++----------- .../fluid/tests/unittests/test_conj_op.py | 83 ++++- 2 files changed, 198 insertions(+), 184 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py index 1c66e8f16fbdb..c769a85569820 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_op.py +++ b/python/paddle/fluid/tests/unittests/test_complex_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -12,221 +12,154 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys import unittest import numpy as np +from op_test import OpTest import paddle +from paddle import static +from paddle.fluid import dygraph -sys.path.append("..") -from numpy.random import random as rand -from op_test import OpTest, convert_float_to_uint16 +paddle.enable_static() -import paddle.fluid.core as core -import paddle.fluid.dygraph as dg -import paddle.static as static -paddle.enable_static() +def ref_complex(x, y): + return x + 1j * y + + +def ref_complex_grad(x, y, dout): + out = x + 1j * y + out_rank = out.ndim + delta_rank_x = out_rank - x.ndim + delta_rank_y = out_rank - y.ndim + + dx_reduce_axes = [] + dy_reduce_axes = [] + for i in range(out_rank): + if i < delta_rank_x or dout.shape[i] > x.shape[i - delta_rank_x]: + dx_reduce_axes.append(i) + if i < delta_rank_y or dout.shape[i] > y.shape[i - delta_rank_y]: + dy_reduce_axes.append(i) + dx = np.sum(dout.real, axis=tuple(dx_reduce_axes)).reshape(x.shape) + dy = np.sum(dout.imag, axis=tuple(dy_reduce_axes)).reshape(y.shape) + return (dx, dy) + + +class TestComplexOp(OpTest): + def init_spec(self): + self.x_shape = [10, 10] + self.y_shape = [10, 10] + self.dtype = "float64" -class TestConjOp(OpTest): def setUp(self): - self.op_type = "conj" - self.python_api = paddle.tensor.conj - self.init_dtype_type() - self.init_input_output() - self.init_grad_input_output() - - def init_dtype_type(self): - self.dtype = np.complex64 - - def init_input_output(self): - x = ( - np.random.random((12, 14)) + 1j * np.random.random((12, 14)) - ).astype(self.dtype) - out = np.conj(x) - - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} - self.outputs = {'Out': out} - - def init_grad_input_output(self): - self.grad_out = (np.ones((12, 14)) + 1j * np.ones((12, 14))).astype( + self.op_type = "complex" + self.python_api = paddle.complex + self.init_spec() + x = 
np.random.randn(*self.x_shape).astype(self.dtype) + y = np.random.randn(*self.y_shape).astype(self.dtype) + out_ref = ref_complex(x, y) + self.out_grad = np.random.randn(*self.x_shape).astype( self.dtype - ) - self.grad_in = np.conj(self.grad_out) + ) + 1j * np.random.randn(*self.y_shape).astype(self.dtype) + self.inputs = {'X': x, 'Y': y} + self.outputs = {'Out': out_ref} def test_check_output(self): self.check_output(check_eager=True) - def test_check_grad_normal(self): + def test_check_grad(self): + dout = self.out_grad + dx, dy = ref_complex_grad( + self.inputs['X'], self.inputs['Y'], self.out_grad + ) self.check_grad( - ['X'], + ['X', 'Y'], 'Out', - user_defined_grads=[self.grad_in], - user_defined_grad_outputs=[self.grad_out], + user_defined_grads=[dx, dy], + user_defined_grad_outputs=[dout], check_eager=True, ) + def test_check_grad_ignore_x(self): + dout = self.out_grad + dx, dy = ref_complex_grad( + self.inputs['X'], self.inputs['Y'], self.out_grad + ) + self.assertTupleEqual(dx.shape, tuple(self.x_shape)) + self.assertTupleEqual(dy.shape, tuple(self.y_shape)) + self.check_grad( + ['Y'], + 'Out', + no_grad_set=set('X'), + user_defined_grads=[dy], + user_defined_grad_outputs=[dout], + check_eager=True, + ) -class TestComplexConjOp(unittest.TestCase): - def setUp(self): - self._dtypes = ["float32", "float64"] - self._places = [paddle.CPUPlace()] - if paddle.is_compiled_with_cuda(): - self._places.append(paddle.CUDAPlace(0)) - - def test_conj_api(self): - for dtype in self._dtypes: - input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( - [2, 20, 2, 3] - ).astype(dtype) - for place in self._places: - with dg.guard(place): - var_x = paddle.to_tensor(input) - result = paddle.conj(var_x).numpy() - target = np.conj(input) - np.testing.assert_array_equal(result, target) - - def test_conj_operator(self): - for dtype in self._dtypes: - input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( - [2, 20, 2, 3] - ).astype(dtype) - for place in self._places: - with 
dg.guard(place): - var_x = paddle.to_tensor(input) - result = var_x.conj().numpy() - target = np.conj(input) - np.testing.assert_array_equal(result, target) - - def test_conj_static_mode(self): - def init_input_output(dtype): - input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand( - [2, 20, 2, 3] - ).astype(dtype) - return {'x': input}, np.conj(input) - - for dtype in self._dtypes: - input_dict, np_res = init_input_output(dtype) - for place in self._places: - with static.program_guard(static.Program()): - x_dtype = ( - np.complex64 if dtype == "float32" else np.complex128 - ) - x = static.data( - name="x", shape=[2, 20, 2, 3], dtype=x_dtype - ) - out = paddle.conj(x) - - exe = static.Executor(place) - out_value = exe.run(feed=input_dict, fetch_list=[out.name]) - np.testing.assert_array_equal(np_res, out_value[0]) - - def test_conj_api_real_number(self): - for dtype in self._dtypes: - input = rand([2, 20, 2, 3]).astype(dtype) - for place in self._places: - with dg.guard(place): - var_x = paddle.to_tensor(input) - result = paddle.conj(var_x).numpy() - target = np.conj(input) - np.testing.assert_array_equal(result, target) - - -class Testfp16ConjOp(unittest.TestCase): - def testfp16(self): - input_x = ( - np.random.random((12, 14)) + 1j * np.random.random((12, 14)) - ).astype('float16') - with static.program_guard(static.Program()): - x = static.data(name="x", shape=[12, 14], dtype='float16') - out = paddle.conj(x) - if paddle.is_compiled_with_cuda(): - place = paddle.CUDAPlace(0) - exe = paddle.static.Executor(place) - exe.run(paddle.static.default_startup_program()) - out = exe.run(feed={'x': input_x}, fetch_list=[out]) - - -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestConjFP16OP(OpTest): - def setUp(self): - self.op_type = "conj" - self.python_api = paddle.tensor.conj - self.__class__.op_type = self.op_type - 
self.init_dtype_type() - self.init_input_output() - # self.init_grad_input_output() - - def init_dtype_type(self): - self.dtype = np.float16 - - def init_input_output(self): - x = ( - np.random.random((12, 14)) + 1j * np.random.random((12, 14)) - ).astype(np.float32) - out = np.conj(x) - - self.inputs = {'X': x.astype(self.dtype)} - self.outputs = {'Out': out} - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) - - def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, + def test_check_grad_ignore_y(self): + dout = self.out_grad + dx, dy = ref_complex_grad( + self.inputs['X'], self.inputs['Y'], self.out_grad + ) + self.check_grad( ['X'], 'Out', - max_relative_error=1e-2, + no_grad_set=set('Y'), + user_defined_grads=[dx], + user_defined_grad_outputs=[dout], + check_eager=True, ) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the bfloat16", -) -class TestConjBF16(OpTest): - def setUp(self): - self.op_type = "conj" - self.python_api = paddle.tensor.conj - self.__class__.op_type = self.op_type - self.init_dtype_type() - self.init_input_output() - # self.init_grad_input_output() +class TestComplexOpBroadcast1(TestComplexOp): + def init_spec(self): + self.x_shape = [10, 3, 1, 4] + self.y_shape = [100, 1] + self.dtype = "float64" - def init_dtype_type(self): - self.dtype = np.uint16 - def init_input_output(self): - x = ( - np.random.random((12, 14)) + 1j * np.random.random((12, 14)) - ).astype(np.float32) - out = np.conj(x) +class TestComplexOpBroadcast2(TestComplexOp): + def init_spec(self): + self.x_shape = [100, 1] + self.y_shape = [10, 3, 1, 4] + self.dtype = "float32" - self.inputs = {'X': convert_float_to_uint16(x)} - self.outputs = {'Out': convert_float_to_uint16(out)} - def test_check_output(self): - place = core.CUDAPlace(0) - 
self.check_output_with_place(place, atol=1e-3) +class TestComplexOpBroadcast3(TestComplexOp): + def init_spec(self): + self.x_shape = [1, 100] + self.y_shape = [100] + self.dtype = "float32" - def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - max_relative_error=1e-2, + +class TestComplexAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.randn(10, 10) + self.y = np.random.randn(10, 10) + self.out = ref_complex(self.x, self.y) + + def test_dygraph(self): + with dygraph.guard(): + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + out_np = paddle.complex(x, y).numpy() + np.testing.assert_allclose(self.out, out_np, rtol=1e-05) + + def test_static(self): + mp, sp = static.Program(), static.Program() + with static.program_guard(mp, sp): + x = static.data("x", shape=[10, 10], dtype="float64") + y = static.data("y", shape=[10, 10], dtype="float64") + out = paddle.complex(x, y) + + exe = static.Executor() + exe.run(sp) + [out_np] = exe.run( + mp, feed={"x": self.x, "y": self.y}, fetch_list=[out] ) + np.testing.assert_allclose(self.out, out_np, rtol=1e-05) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py index b3c24ba5f0017..1c66e8f16fbdb 100644 --- a/python/paddle/fluid/tests/unittests/test_conj_op.py +++ b/python/paddle/fluid/tests/unittests/test_conj_op.py @@ -21,8 +21,9 @@ sys.path.append("..") from numpy.random import random as rand -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 +import paddle.fluid.core as core import paddle.fluid.dygraph as dg import paddle.static as static @@ -148,5 +149,85 @@ def testfp16(self): out = exe.run(feed={'x': input_x}, fetch_list=[out]) +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_float16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the float16", +) +class 
TestConjFP16OP(OpTest): + def setUp(self): + self.op_type = "conj" + self.python_api = paddle.tensor.conj + self.__class__.op_type = self.op_type + self.init_dtype_type() + self.init_input_output() + # self.init_grad_input_output() + + def init_dtype_type(self): + self.dtype = np.float16 + + def init_input_output(self): + x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype(np.float32) + out = np.conj(x) + + self.inputs = {'X': x.astype(self.dtype)} + self.outputs = {'Out': out} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + max_relative_error=1e-2, + ) + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestConjBF16(OpTest): + def setUp(self): + self.op_type = "conj" + self.python_api = paddle.tensor.conj + self.__class__.op_type = self.op_type + self.init_dtype_type() + self.init_input_output() + # self.init_grad_input_output() + + def init_dtype_type(self): + self.dtype = np.uint16 + + def init_input_output(self): + x = ( + np.random.random((12, 14)) + 1j * np.random.random((12, 14)) + ).astype(np.float32) + out = np.conj(x) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, + ['X'], + 'Out', + max_relative_error=1e-2, + ) + + if __name__ == "__main__": unittest.main() From 737a8dde12acc444b650b606fb108a8f8cfecbcb Mon Sep 17 00:00:00 2001 From: longranger2 <836253168@qq.com> Date: Sun, 19 Mar 2023 23:59:08 +0800 Subject: [PATCH 07/10] fix bug --- 
paddle/phi/kernels/gpu/complex_kernel.cu | 2 +- .../kernels/gpu/temporal_shift_grad_kernel.cu | 1 - .../phi/kernels/gpu/temporal_shift_kernel.cu | 1 - .../fluid/tests/unittests/test_conj_op.py | 49 +----- .../tests/unittests/test_temporal_shift_op.py | 149 +++++++----------- 5 files changed, 65 insertions(+), 137 deletions(-) diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu index 9ebf56c2a5feb..3b26984d87dde 100644 --- a/paddle/phi/kernels/gpu/complex_kernel.cu +++ b/paddle/phi/kernels/gpu/complex_kernel.cu @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/phi/kernels/complex_kernel.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/complex_kernel_impl.h" diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu index ec20e0b523a13..b50fad637d106 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/temporal_shift_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu index d83713f064f0a..4904da296488f 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu @@ -15,7 +15,6 @@ #include "paddle/phi/kernels/temporal_shift_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py 
b/python/paddle/fluid/tests/unittests/test_conj_op.py index d2a7bcdad776e..e356743b179db 100644 --- a/python/paddle/fluid/tests/unittests/test_conj_op.py +++ b/python/paddle/fluid/tests/unittests/test_conj_op.py @@ -23,7 +23,6 @@ from eager_op_test import OpTest, convert_float_to_uint16 from numpy.random import random as rand - import paddle.fluid.core as core import paddle.fluid.dygraph as dg import paddle.static as static @@ -149,45 +148,10 @@ def testfp16(self): out = exe.run(feed={'x': input_x}, fetch_list=[out]) -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestConjFP16OP(OpTest): - def setUp(self): - self.op_type = "conj" - self.python_api = paddle.tensor.conj - self.__class__.op_type = self.op_type - self.init_dtype_type() - self.init_input_output() - # self.init_grad_input_output() - +class TestConjFP16OP(TestConjOp): def init_dtype_type(self): self.dtype = np.float16 - def init_input_output(self): - x = ( - np.random.random((12, 14)) + 1j * np.random.random((12, 14)) - ).astype(np.float32) - out = np.conj(x) - - self.inputs = {'X': x.astype(self.dtype)} - self.outputs = {'Out': out} - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) - - def test_check_grad(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - max_relative_error=1e-2, - ) - @unittest.skipIf( not core.is_compiled_with_cuda() @@ -198,10 +162,8 @@ class TestConjBF16(OpTest): def setUp(self): self.op_type = "conj" self.python_api = paddle.tensor.conj - self.__class__.op_type = self.op_type self.init_dtype_type() self.init_input_output() - # self.init_grad_input_output() def init_dtype_type(self): self.dtype = np.uint16 @@ -217,16 +179,11 @@ def init_input_output(self): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, 
atol=1e-3) + self.check_output_with_place(place) def test_check_grad(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - max_relative_error=1e-2, - ) + self.check_grad_with_place(place, ['X'], 'Out') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py index 62591655a4508..64f3ac0169f2c 100644 --- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py +++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py @@ -44,6 +44,7 @@ def temporal_shift(x, seg_num, shift_ratio, data_format): class TestTemporalShift(OpTest): def setUp(self): self.initTestCase() + self.init_dtype() self.op_type = 'temporal_shift' self.python_api = paddle.nn.functional.temporal_shift x = np.random.random(self.x_shape).astype(self.dtype) @@ -64,6 +65,9 @@ def setUp(self): self.outputs = {"Out": output} self.python_out_sig = ["Out"] + def init_dtype(self): + self.dtype = 'float64' + def test_check_output(self): self.check_output(check_eager=True) @@ -74,7 +78,6 @@ def initTestCase(self): self.x_shape = (6, 4, 4, 4) self.seg_num = 3 self.shift_ratio = 0.25 - self.dtype = 'float64' self.data_format = 'NCHW' @@ -103,105 +106,25 @@ def initTestCase(self): @unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_float16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the float16", -) -class TestTemporalShiftFP16OP(OpTest): - def initTestCase(self): - self.x_shape = (3, 10, 5, 5) - self.seg_num = 1 - self.shift_ratio = 0.3 - self.dtype = np.float16 - self.data_format = 'NCHW' - - def setUp(self): - self.initTestCase() - self.op_type = 'temporal_shift' - self.python_api = paddle.nn.functional.temporal_shift - self.__class__.op_type = self.op_type - x = np.random.random(self.x_shape).astype(np.float32) - - self.attrs = { - "seg_num": self.seg_num, - "shift_ratio": self.shift_ratio, - 
"data_format": self.data_format, - } - - self.inputs = { - "X": x.astype(self.dtype), - } - - output = temporal_shift( - x, self.seg_num, self.shift_ratio, self.data_format - ) - self.outputs = {"Out": output} - self.python_out_sig = ["Out"] - - def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=True, atol=1e-3) - - def test_check_grad_ignore_uv(self): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - check_eager=True, - max_relative_error=1e-2, - ) - - -@unittest.skipIf( - not core.is_compiled_with_cuda() - or not core.is_bfloat16_supported(core.CUDAPlace(0)), - "core is not complied with CUDA and not support the bfloat16", + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" ) -class TestTemporalShiftBF16(OpTest): +class TestTemporalShiftFP16(TestTemporalShift): def initTestCase(self): self.x_shape = (3, 10, 5, 5) self.seg_num = 1 self.shift_ratio = 0.3 - self.dtype = np.uint16 + self.dtype = 'float16' self.data_format = 'NCHW' - def setUp(self): - self.initTestCase() - self.op_type = 'temporal_shift' - self.python_api = paddle.nn.functional.temporal_shift - self.__class__.op_type = self.op_type - x = np.random.random(self.x_shape).astype(np.float32) - - self.attrs = { - "seg_num": self.seg_num, - "shift_ratio": self.shift_ratio, - "data_format": self.data_format, - } - - self.inputs = { - "X": convert_float_to_uint16(x), - } - - output = temporal_shift( - x, self.seg_num, self.shift_ratio, self.data_format - ) - self.outputs = {"Out": convert_float_to_uint16(output)} - self.python_out_sig = ["Out"] - def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, check_eager=True, atol=1e-3) + if core.is_float16_supported(place): + self.check_output_with_place(place) def test_check_grad_ignore_uv(self): place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - ['X'], - 'Out', - check_eager=True, - 
max_relative_error=1e-2, - ) + if core.is_float16_supported(place): + self.check_grad_with_place(place, ['X'], 'Out') class TestTemporalShiftAPI(unittest.TestCase): @@ -254,6 +177,56 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) +class TestTemporalShiftFP16OP(TestTemporalShift): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not complied with CUDA and not support the bfloat16", +) +class TestTemporalShiftBF16(OpTest): + def initTestCase(self): + self.x_shape = (3, 10, 5, 5) + self.seg_num = 1 + self.shift_ratio = 0.3 + self.dtype = np.uint16 + self.data_format = 'NCHW' + + def setUp(self): + self.initTestCase() + self.op_type = 'temporal_shift' + self.python_api = paddle.nn.functional.temporal_shift + + x = np.random.random(self.x_shape).astype(np.float32) + + self.attrs = { + "seg_num": self.seg_num, + "shift_ratio": self.shift_ratio, + "data_format": self.data_format, + } + + self.inputs = { + "X": convert_float_to_uint16(x), + } + + output = temporal_shift( + x, self.seg_num, self.shift_ratio, self.data_format + ) + self.outputs = {"Out": convert_float_to_uint16(output)} + self.python_out_sig = ["Out"] + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad_ignore_uv(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out') + + if __name__ == "__main__": paddle.enable_static() unittest.main() From 0beacd2f22ee7657207df1118c094ffbb3883f1b Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Mon, 20 Mar 2023 09:00:47 +0800 Subject: [PATCH 08/10] Update complex_kernel.h fix bug --- paddle/phi/kernels/complex_kernel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h index 5c9cf9c408237..ad66b890b3d5a 100644 --- 
a/paddle/phi/kernels/complex_kernel.h +++ b/paddle/phi/kernels/complex_kernel.h @@ -16,7 +16,6 @@ limitations under the License. */ #include "paddle/phi/common/complex.h" #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" From 4bf323df168c6ebc4be2caba68f0b8321d51760a Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Mon, 20 Mar 2023 09:01:20 +0800 Subject: [PATCH 09/10] Update temporal_shift_grad_kernel.h fix bug --- paddle/phi/kernels/temporal_shift_grad_kernel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/temporal_shift_grad_kernel.h b/paddle/phi/kernels/temporal_shift_grad_kernel.h index e91d08045ab88..1bcd3d61c26f5 100644 --- a/paddle/phi/kernels/temporal_shift_grad_kernel.h +++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi { From ba8553ffcaf46d7697d744d47b9ccaeef521a677 Mon Sep 17 00:00:00 2001 From: LoneRanger <836253168@qq.com> Date: Mon, 20 Mar 2023 09:02:21 +0800 Subject: [PATCH 10/10] Update temporal_shift_kernel.h fix bug --- paddle/phi/kernels/temporal_shift_kernel.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/phi/kernels/temporal_shift_kernel.h b/paddle/phi/kernels/temporal_shift_kernel.h index 7c85ffd9783aa..a927d7fb23aae 100644 --- a/paddle/phi/kernels/temporal_shift_kernel.h +++ b/paddle/phi/kernels/temporal_shift_kernel.h @@ -15,7 +15,6 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" namespace phi {