From c8becdd9326d841e2bf69413d137d44cb3bbbb7a Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Sun, 12 Mar 2023 21:33:32 +0800
Subject: [PATCH 01/10] add fp16 and bf16 for temporal_shift

---
 .../kernels/gpu/temporal_shift_grad_kernel.cu |  4 +-
 .../phi/kernels/gpu/temporal_shift_kernel.cu  |  4 +-
 .../phi/kernels/temporal_shift_grad_kernel.h  |  1 +
 paddle/phi/kernels/temporal_shift_kernel.h    |  1 +
 .../tests/unittests/test_temporal_shift_op.py | 96 +++++++++++++++++--
 5 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
index cc5d95a12f7a3..ec20e0b523a13 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 
@@ -146,4 +147,5 @@ PD_REGISTER_KERNEL(temporal_shift_grad,
                    phi::TemporalShiftGradKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
index b321fad07ac1f..d83713f064f0a 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/temporal_shift_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 
@@ -146,4 +147,5 @@ PD_REGISTER_KERNEL(temporal_shift,
                    phi::TemporalShiftKernel,
                    float,
                    double,
-                   phi::dtype::float16) {}
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
diff --git a/paddle/phi/kernels/temporal_shift_grad_kernel.h b/paddle/phi/kernels/temporal_shift_grad_kernel.h
index 1bcd3d61c26f5..e91d08045ab88 100644
--- a/paddle/phi/kernels/temporal_shift_grad_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 
 namespace phi {
 
diff --git a/paddle/phi/kernels/temporal_shift_kernel.h b/paddle/phi/kernels/temporal_shift_kernel.h
index a927d7fb23aae..7c85ffd9783aa 100644
--- a/paddle/phi/kernels/temporal_shift_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 
 namespace phi {
 
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 6b99e0ead0886..30c07b1ca5464 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -15,7 +15,7 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 
 import paddle
 from paddle.fluid import core
@@ -103,25 +103,105 @@ def initTestCase(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the float16",
 )
-class TestTemporalShiftFP16(TestTemporalShift):
+class TestTemporalShiftFP16OP(OpTest):
     def initTestCase(self):
         self.x_shape = (3, 10, 5, 5)
         self.seg_num = 1
         self.shift_ratio = 0.3
-        self.dtype = 'float16'
+        self.dtype = np.float16
         self.data_format = 'NCHW'
 
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'temporal_shift'
+        self.python_api = paddle.nn.functional.temporal_shift
+        self.__class__.op_type = self.op_type
+        x = np.random.random(self.x_shape).astype(np.float32)
+
+        self.attrs = {
+            "seg_num": self.seg_num,
+            "shift_ratio": self.shift_ratio,
+            "data_format": self.data_format,
+        }
+
+        self.inputs = {
+            "X": x.astype(self.dtype),
+        }
+
+        output = temporal_shift(
+            x, self.seg_num, self.shift_ratio, self.data_format
+        )
+        self.outputs = {"Out": output}
+        self.python_out_sig = ["Out"]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, check_eager=True, atol=1e-3)
+
+    def test_check_grad_ignore_uv(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+            check_eager=True,
+            max_relative_error=1e-2,
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the float16",
+)
+class TestTemporalShiftBF16(OpTest):
+    def initTestCase(self):
+        self.x_shape = (3, 10, 5, 5)
+        self.seg_num = 1
+        self.shift_ratio = 0.3
+        self.dtype = np.uint16
+        self.data_format = 'NCHW'
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'temporal_shift'
+        self.python_api = paddle.nn.functional.temporal_shift
+        self.__class__.op_type = self.op_type
+        x = np.random.random(self.x_shape).astype(np.float32)
+
+        self.attrs = {
+            "seg_num": self.seg_num,
+            "shift_ratio": self.shift_ratio,
+            "data_format": self.data_format,
+        }
+
+        self.inputs = {
+            "X": convert_float_to_uint16(x),
+        }
+
+        output = temporal_shift(
+            x, self.seg_num, self.shift_ratio, self.data_format
+        )
+        self.outputs = {"Out": convert_float_to_uint16(output)}
+        self.python_out_sig = ["Out"]
+
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_output_with_place(place)
+        self.check_output_with_place(place, check_eager=True, atol=1e-3)
 
     def test_check_grad_ignore_uv(self):
         place = core.CUDAPlace(0)
-        if core.is_float16_supported(place):
-            self.check_grad_with_place(place, ['X'], 'Out')
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+            check_eager=True,
+            max_relative_error=1e-2,
+        )
 
 
 class TestTemporalShiftAPI(unittest.TestCase):

From 2207120533816d8697fb22f9ca25e0aa641f82d6 Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Mon, 13 Mar 2023 12:23:23 +0800
Subject: [PATCH 02/10] add fp16 and bf16 for complex

---
 paddle/phi/kernels/complex_grad_kernel.h      |   1 +
 paddle/phi/kernels/complex_kernel.h           |   1 +
 paddle/phi/kernels/gpu/complex_grad_kernel.cu |  12 ++-
 paddle/phi/kernels/gpu/complex_kernel.cu      |  12 ++-
 .../fluid/tests/unittests/test_complex_op.py  | 100 +++++++++++++++++-
 .../tests/unittests/test_temporal_shift_op.py |   4 +-
 6 files changed, 120 insertions(+), 10 deletions(-)

diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h
index 91c47538e958d..393b39e562617 100644
--- a/paddle/phi/kernels/complex_grad_kernel.h
+++ b/paddle/phi/kernels/complex_grad_kernel.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index ad66b890b3d5a..47639ed430427 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
index b2a6e4117c075..0b722591bf3a0 100644
--- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_grad_kernel.h"
-
 #include "paddle/phi/common/complex.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
 
@@ -36,7 +36,13 @@ PD_REGISTER_KERNEL(real_grad,
   kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(
-    complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) {
+PD_REGISTER_KERNEL(complex_grad,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ComplexGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index 5c5bf104128d3..4f6c609b8a44c 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
@@ -51,7 +51,13 @@ PD_REGISTER_KERNEL(imag,
   kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(
-    complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) {
+PD_REGISTER_KERNEL(complex,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ComplexKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py
index c769a85569820..8ed31fe99b6ac 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_op.py
@@ -15,11 +15,11 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 
 import paddle
 from paddle import static
-from paddle.fluid import dygraph
+from paddle.fluid import core, dygraph
 
 paddle.enable_static()
 
@@ -162,5 +162,101 @@ def test_static(self):
         np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the float16",
+)
+class TestComplexFP16Op(OpTest):
+    def init_spec(self):
+        self.x_shape = [10, 10]
+        self.y_shape = [10, 10]
+        self.dtype = np.float16
+
+    def setUp(self):
+        self.op_type = "complex"
+        self.python_api = paddle.complex
+        self.init_spec()
+        self.__class__.op_type = self.op_type
+        x = np.random.randn(*self.x_shape).astype(np.float32)
+        y = np.random.randn(*self.y_shape).astype(np.float32)
+        out_ref = ref_complex(x, y).astype(np.float64)
+        self.out_grad = np.random.randn(*self.x_shape).astype(
+            np.float64
+        ) + 1j * np.random.randn(*self.y_shape).astype(np.float64)
+        self.inputs = {'X': x.astype(self.dtype), 'Y': y.astype(self.dtype)}
+        self.outputs = {'Out': out_ref}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, check_eager=True, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        dout = self.out_grad
+        dx, dy = ref_complex_grad(
+            self.inputs['X'], self.inputs['Y'], self.out_grad
+        )
+        self.check_grad(
+            place,
+            ['X', 'Y'],
+            'Out',
+            user_defined_grads=[dx, dy],
+            user_defined_grad_outputs=[dout],
+            check_eager=True,
+            max_relative_error=1e-2,
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestComplexBF16(OpTest):
+    def init_spec(self):
+        self.x_shape = [10, 10]
+        self.y_shape = [10, 10]
+        self.dtype = np.uint16
+
+    def setUp(self):
+        self.op_type = "complex"
+        self.python_api = paddle.complex
+        self.init_spec()
+        self.__class__.op_type = self.op_type
+        x = np.random.randn(*self.x_shape).astype(np.float32)
+        y = np.random.randn(*self.y_shape).astype(np.float32)
+        out_ref = ref_complex(x, y).astype(np.float64)
+        self.out_grad = convert_float_to_uint16(
+            np.random.randn(*self.x_shape).astype(np.float64)
+            + 1j * np.random.randn(*self.y_shape).astype(np.float64)
+        )
+        self.inputs = {
+            'X': convert_float_to_uint16(x),
+            'Y': convert_float_to_uint16(y),
+        }
+        self.outputs = {'Out': convert_float_to_uint16(out_ref)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, check_eager=True, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        dout = self.out_grad
+        dx, dy = ref_complex_grad(
+            self.inputs['X'], self.inputs['Y'], self.out_grad
+        )
+        self.check_grad(
+            place,
+            ['X', 'Y'],
+            'Out',
+            user_defined_grads=[dx, dy],
+            user_defined_grad_outputs=[dout],
+            check_eager=True,
+            max_relative_error=1e-2,
+        )
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 30c07b1ca5464..62591655a4508 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -155,8 +155,8 @@ def test_check_grad_ignore_uv(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
-    or not core.is_float16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the float16",
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
 )
 class TestTemporalShiftBF16(OpTest):
     def initTestCase(self):

From 95edb7e7e5780ccaf2e228f95b20b978dcce1431 Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Mon, 13 Mar 2023 21:29:39 +0800
Subject: [PATCH 03/10] revert fp16 and bf16 registration and tests for complex

---
 paddle/phi/kernels/complex_grad_kernel.h      |   1 -
 paddle/phi/kernels/complex_kernel.h           |   2 +-
 paddle/phi/kernels/gpu/complex_grad_kernel.cu |  11 +-
 paddle/phi/kernels/gpu/complex_kernel.cu      |  11 +-
 .../fluid/tests/unittests/test_complex_op.py  | 100 +-----------------
 5 files changed, 7 insertions(+), 118 deletions(-)

diff --git a/paddle/phi/kernels/complex_grad_kernel.h b/paddle/phi/kernels/complex_grad_kernel.h
index 393b39e562617..91c47538e958d 100644
--- a/paddle/phi/kernels/complex_grad_kernel.h
+++ b/paddle/phi/kernels/complex_grad_kernel.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index 47639ed430427..ac33e32826daa 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/common/complex.h"
-#include "paddle/phi/common/data_type.h"
+
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
index 0b722591bf3a0..cb9f527bcae13 100644
--- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/kernels/complex_grad_kernel.h"
 #include "paddle/phi/common/complex.h"
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
 
@@ -36,13 +35,7 @@ PD_REGISTER_KERNEL(real_grad,
   kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(complex_grad,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::ComplexGradKernel,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {
+PD_REGISTER_KERNEL(
+    complex_grad, GPU, ALL_LAYOUT, phi::ComplexGradKernel, float, double) {
   kernel->InputAt(2).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index 4f6c609b8a44c..2ea178f102297 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/kernels/complex_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
@@ -51,13 +50,7 @@ PD_REGISTER_KERNEL(imag,
   kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype()));
 }
 
-PD_REGISTER_KERNEL(complex,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::ComplexKernel,
-                   float,
-                   double,
-                   phi::dtype::float16,
-                   phi::dtype::bfloat16) {
+PD_REGISTER_KERNEL(
+    complex, GPU, ALL_LAYOUT, phi::ComplexKernel, float, double) {
   kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
 }
diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py
index 8ed31fe99b6ac..c769a85569820 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_op.py
@@ -15,11 +15,11 @@
 import unittest
 
 import numpy as np
-from op_test import OpTest, convert_float_to_uint16
+from op_test import OpTest
 
 import paddle
 from paddle import static
-from paddle.fluid import core, dygraph
+from paddle.fluid import dygraph
 
 paddle.enable_static()
 
@@ -162,101 +162,5 @@ def test_static(self):
         np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_float16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the float16",
-)
-class TestComplexFP16Op(OpTest):
-    def init_spec(self):
-        self.x_shape = [10, 10]
-        self.y_shape = [10, 10]
-        self.dtype = np.float16
-
-    def setUp(self):
-        self.op_type = "complex"
-        self.python_api = paddle.complex
-        self.init_spec()
-        self.__class__.op_type = self.op_type
-        x = np.random.randn(*self.x_shape).astype(np.float32)
-        y = np.random.randn(*self.y_shape).astype(np.float32)
-        out_ref = ref_complex(x, y).astype(np.float64)
-        self.out_grad = np.random.randn(*self.x_shape).astype(
-            np.float64
-        ) + 1j * np.random.randn(*self.y_shape).astype(np.float64)
-        self.inputs = {'X': x.astype(self.dtype), 'Y': y.astype(self.dtype)}
-        self.outputs = {'Out': out_ref}
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_eager=True, atol=1e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        dout = self.out_grad
-        dx, dy = ref_complex_grad(
-            self.inputs['X'], self.inputs['Y'], self.out_grad
-        )
-        self.check_grad(
-            place,
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[dx, dy],
-            user_defined_grad_outputs=[dout],
-            check_eager=True,
-            max_relative_error=1e-2,
-        )
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the bfloat16",
-)
-class TestComplexBF16(OpTest):
-    def init_spec(self):
-        self.x_shape = [10, 10]
-        self.y_shape = [10, 10]
-        self.dtype = np.uint16
-
-    def setUp(self):
-        self.op_type = "complex"
-        self.python_api = paddle.complex
-        self.init_spec()
-        self.__class__.op_type = self.op_type
-        x = np.random.randn(*self.x_shape).astype(np.float32)
-        y = np.random.randn(*self.y_shape).astype(np.float32)
-        out_ref = ref_complex(x, y).astype(np.float64)
-        self.out_grad = convert_float_to_uint16(
-            np.random.randn(*self.x_shape).astype(np.float64)
-            + 1j * np.random.randn(*self.y_shape).astype(np.float64)
-        )
-        self.inputs = {
-            'X': convert_float_to_uint16(x),
-            'Y': convert_float_to_uint16(y),
-        }
-        self.outputs = {'Out': convert_float_to_uint16(out_ref)}
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_eager=True, atol=1e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        dout = self.out_grad
-        dx, dy = ref_complex_grad(
-            self.inputs['X'], self.inputs['Y'], self.out_grad
-        )
-        self.check_grad(
-            place,
-            ['X', 'Y'],
-            'Out',
-            user_defined_grads=[dx, dy],
-            user_defined_grad_outputs=[dout],
-            check_eager=True,
-            max_relative_error=1e-2,
-        )
-
-
 if __name__ == "__main__":
     unittest.main()

From 7084a8b318589684c401db04e7f34c1a3b698f82 Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Mon, 13 Mar 2023 21:32:16 +0800
Subject: [PATCH 04/10] restore include spacing in complex kernel files

---
 paddle/phi/kernels/complex_kernel.h           | 1 -
 paddle/phi/kernels/gpu/complex_grad_kernel.cu | 1 +
 paddle/phi/kernels/gpu/complex_kernel.cu      | 1 +
 3 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index ac33e32826daa..ad66b890b3d5a 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/phi/common/complex.h"
-
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
diff --git a/paddle/phi/kernels/gpu/complex_grad_kernel.cu b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
index cb9f527bcae13..b2a6e4117c075 100644
--- a/paddle/phi/kernels/gpu/complex_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_grad_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_grad_kernel.h"
+
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_grad_kernel_impl.h"
diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index 2ea178f102297..5c5bf104128d3 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_kernel.h"
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_kernel_impl.h"

From 1147ebe90d0041c7eda7b053304f612f64d30cfd Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Mon, 13 Mar 2023 22:54:55 +0800
Subject: [PATCH 05/10] add fp16 and bf16 for conj

---
 paddle/phi/kernels/complex_kernel.h           |   1 +
 paddle/phi/kernels/gpu/complex_kernel.cu      |   3 +-
 .../fluid/tests/unittests/test_complex_op.py  | 299 +++++++++++-------
 3 files changed, 186 insertions(+), 117 deletions(-)

diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index ad66b890b3d5a..5c9cf9c408237 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 
diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index 5c5bf104128d3..9ebf56c2a5feb 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_kernel.h"
-
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
@@ -26,6 +26,7 @@ PD_REGISTER_KERNEL(conj,
                    ALL_LAYOUT,
                    phi::ConjKernel,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>,
                    float,
diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py
index c769a85569820..1c66e8f16fbdb 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,154 +12,221 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import unittest
 
 import numpy as np
-from op_test import OpTest
 
 import paddle
-from paddle import static
-from paddle.fluid import dygraph
 
-paddle.enable_static()
-
-
-def ref_complex(x, y):
-    return x + 1j * y
-
-
-def ref_complex_grad(x, y, dout):
-    out = x + 1j * y
-    out_rank = out.ndim
-    delta_rank_x = out_rank - x.ndim
-    delta_rank_y = out_rank - y.ndim
+sys.path.append("..")
+from numpy.random import random as rand
+from op_test import OpTest, convert_float_to_uint16
 
-    dx_reduce_axes = []
-    dy_reduce_axes = []
-
-    for i in range(out_rank):
-        if i < delta_rank_x or dout.shape[i] > x.shape[i - delta_rank_x]:
-            dx_reduce_axes.append(i)
-        if i < delta_rank_y or dout.shape[i] > y.shape[i - delta_rank_y]:
-            dy_reduce_axes.append(i)
-    dx = np.sum(dout.real, axis=tuple(dx_reduce_axes)).reshape(x.shape)
-    dy = np.sum(dout.imag, axis=tuple(dy_reduce_axes)).reshape(y.shape)
-    return (dx, dy)
+import paddle.fluid.core as core
+import paddle.fluid.dygraph as dg
+import paddle.static as static
 
+paddle.enable_static()
 
-class TestComplexOp(OpTest):
-    def init_spec(self):
-        self.x_shape = [10, 10]
-        self.y_shape = [10, 10]
-        self.dtype = "float64"
 
+class TestConjOp(OpTest):
     def setUp(self):
-        self.op_type = "complex"
-        self.python_api = paddle.complex
-        self.init_spec()
-        x = np.random.randn(*self.x_shape).astype(self.dtype)
-        y = np.random.randn(*self.y_shape).astype(self.dtype)
-        out_ref = ref_complex(x, y)
-        self.out_grad = np.random.randn(*self.x_shape).astype(
+        self.op_type = "conj"
+        self.python_api = paddle.tensor.conj
+        self.init_dtype_type()
+        self.init_input_output()
+        self.init_grad_input_output()
+
+    def init_dtype_type(self):
+        self.dtype = np.complex64
+
+    def init_input_output(self):
+        x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype(self.dtype)
+        out = np.conj(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def init_grad_input_output(self):
+        self.grad_out = (np.ones((12, 14)) + 1j * np.ones((12, 14))).astype(
             self.dtype
-        ) + 1j * np.random.randn(*self.y_shape).astype(self.dtype)
-        self.inputs = {'X': x, 'Y': y}
-        self.outputs = {'Out': out_ref}
+        )
+        self.grad_in = np.conj(self.grad_out)
 
     def test_check_output(self):
         self.check_output(check_eager=True)
 
-    def test_check_grad(self):
-        dout = self.out_grad
-        dx, dy = ref_complex_grad(
-            self.inputs['X'], self.inputs['Y'], self.out_grad
-        )
+    def test_check_grad_normal(self):
         self.check_grad(
-            ['X', 'Y'],
+            ['X'],
             'Out',
-            user_defined_grads=[dx, dy],
-            user_defined_grad_outputs=[dout],
+            user_defined_grads=[self.grad_in],
+            user_defined_grad_outputs=[self.grad_out],
             check_eager=True,
         )
 
-    def test_check_grad_ignore_x(self):
-        dout = self.out_grad
-        dx, dy = ref_complex_grad(
-            self.inputs['X'], self.inputs['Y'], self.out_grad
-        )
-        self.assertTupleEqual(dx.shape, tuple(self.x_shape))
-        self.assertTupleEqual(dy.shape, tuple(self.y_shape))
-        self.check_grad(
-            ['Y'],
-            'Out',
-            no_grad_set=set('X'),
-            user_defined_grads=[dy],
-            user_defined_grad_outputs=[dout],
-            check_eager=True,
-        )
 
-    def test_check_grad_ignore_y(self):
-        dout = self.out_grad
-        dx, dy = ref_complex_grad(
-            self.inputs['X'], self.inputs['Y'], self.out_grad
-        )
-        self.check_grad(
+class TestComplexConjOp(unittest.TestCase):
+    def setUp(self):
+        self._dtypes = ["float32", "float64"]
+        self._places = [paddle.CPUPlace()]
+        if paddle.is_compiled_with_cuda():
+            self._places.append(paddle.CUDAPlace(0))
+
+    def test_conj_api(self):
+        for dtype in self._dtypes:
+            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
+                [2, 20, 2, 3]
+            ).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    var_x = paddle.to_tensor(input)
+                    result = paddle.conj(var_x).numpy()
+                    target = np.conj(input)
+                    np.testing.assert_array_equal(result, target)
+
+    def test_conj_operator(self):
+        for dtype in self._dtypes:
+            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
+                [2, 20, 2, 3]
+            ).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    var_x = paddle.to_tensor(input)
+                    result = var_x.conj().numpy()
+                    target = np.conj(input)
+                    np.testing.assert_array_equal(result, target)
+
+    def test_conj_static_mode(self):
+        def init_input_output(dtype):
+            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
+                [2, 20, 2, 3]
+            ).astype(dtype)
+            return {'x': input}, np.conj(input)
+
+        for dtype in self._dtypes:
+            input_dict, np_res = init_input_output(dtype)
+            for place in self._places:
+                with static.program_guard(static.Program()):
+                    x_dtype = (
+                        np.complex64 if dtype == "float32" else np.complex128
+                    )
+                    x = static.data(
+                        name="x", shape=[2, 20, 2, 3], dtype=x_dtype
+                    )
+                    out = paddle.conj(x)
+
+                    exe = static.Executor(place)
+                    out_value = exe.run(feed=input_dict, fetch_list=[out.name])
+                    np.testing.assert_array_equal(np_res, out_value[0])
+
+    def test_conj_api_real_number(self):
+        for dtype in self._dtypes:
+            input = rand([2, 20, 2, 3]).astype(dtype)
+            for place in self._places:
+                with dg.guard(place):
+                    var_x = paddle.to_tensor(input)
+                    result = paddle.conj(var_x).numpy()
+                    target = np.conj(input)
+                    np.testing.assert_array_equal(result, target)
+
+
+class Testfp16ConjOp(unittest.TestCase):
+    def testfp16(self):
+        input_x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype('float16')
+        with static.program_guard(static.Program()):
+            x = static.data(name="x", shape=[12, 14], dtype='float16')
+            out = paddle.conj(x)
+            if paddle.is_compiled_with_cuda():
+                place = paddle.CUDAPlace(0)
+                exe = paddle.static.Executor(place)
+                exe.run(paddle.static.default_startup_program())
+                out = exe.run(feed={'x': input_x}, fetch_list=[out])
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the float16",
+)
+class TestConjFP16OP(OpTest):
+    def setUp(self):
+        self.op_type = "conj"
+        self.python_api = paddle.tensor.conj
+        self.__class__.op_type = self.op_type
+        self.init_dtype_type()
+        self.init_input_output()
+        # self.init_grad_input_output()
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+
+    def init_input_output(self):
+        x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype(np.float32)
+        out = np.conj(x)
+
+        self.inputs = {'X': x.astype(self.dtype)}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
             ['X'],
             'Out',
-            no_grad_set=set('Y'),
-            user_defined_grads=[dx],
-            user_defined_grad_outputs=[dout],
-            check_eager=True,
+            max_relative_error=1e-2,
         )
 
 
-class TestComplexOpBroadcast1(TestComplexOp):
-    def init_spec(self):
-        self.x_shape = [10, 3, 1, 4]
-        self.y_shape = [100, 1]
-        self.dtype = "float64"
-
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestConjBF16(OpTest):
+    def setUp(self):
+        self.op_type = "conj"
+        self.python_api = paddle.tensor.conj
+        self.__class__.op_type = self.op_type
+        self.init_dtype_type()
+        self.init_input_output()
+        # self.init_grad_input_output()
 
-class TestComplexOpBroadcast2(TestComplexOp):
-    def init_spec(self):
-        self.x_shape = [100, 1]
-        self.y_shape = [10, 3, 1, 4]
-        self.dtype = "float32"
+    def init_dtype_type(self):
+        self.dtype = np.uint16
 
+    def init_input_output(self):
+        x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype(np.float32)
+        out = np.conj(x)
 
-class TestComplexOpBroadcast3(TestComplexOp):
-    def init_spec(self):
-        self.x_shape = [1, 100]
-        self.y_shape = [100]
-        self.dtype = "float32"
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
 
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
 
-class TestComplexAPI(unittest.TestCase):
-    def setUp(self):
-        self.x = np.random.randn(10, 10)
-        self.y = np.random.randn(10, 10)
-        self.out = ref_complex(self.x, self.y)
-
-    def test_dygraph(self):
-        with dygraph.guard():
-            x = paddle.to_tensor(self.x)
-            y = paddle.to_tensor(self.y)
-            out_np = paddle.complex(x, y).numpy()
-        np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
-
-    def test_static(self):
-        mp, sp = static.Program(), static.Program()
-        with static.program_guard(mp, sp):
-            x = static.data("x", shape=[10, 10], dtype="float64")
-            y = static.data("y", shape=[10, 10], dtype="float64")
-            out = paddle.complex(x, y)
-
-        exe = static.Executor()
-        exe.run(sp)
-        [out_np] = exe.run(
-            mp, feed={"x": self.x, "y": self.y}, fetch_list=[out]
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+            max_relative_error=1e-2,
         )
-        np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
 
 
 if __name__ == "__main__":

From 04083a5fac068750e64656a6e2396becb58c51a9 Mon Sep 17 00:00:00 2001
From: loneranger <836253168@qq.com>
Date: Mon, 13 Mar 2023 23:08:21 +0800
Subject: [PATCH 06/10] restore test_complex_op.py and move conj fp16/bf16 tests to test_conj_op.py

---
 .../fluid/tests/unittests/test_complex_op.py  | 299 +++++++-----------
 .../fluid/tests/unittests/test_conj_op.py     |  83 ++++-
 2 files changed, 198 insertions(+), 184 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_complex_op.py b/python/paddle/fluid/tests/unittests/test_complex_op.py
index 1c66e8f16fbdb..c769a85569820 100644
--- a/python/paddle/fluid/tests/unittests/test_complex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_complex_op.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,221 +12,154 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import sys
 import unittest
 
 import numpy as np
+from op_test import OpTest
 
 import paddle
+from paddle import static
+from paddle.fluid import dygraph
 
-sys.path.append("..")
-from numpy.random import random as rand
-from op_test import OpTest, convert_float_to_uint16
+paddle.enable_static()
 
-import paddle.fluid.core as core
-import paddle.fluid.dygraph as dg
-import paddle.static as static
 
-paddle.enable_static()
+def ref_complex(x, y):
+    return x + 1j * y
+
+
+def ref_complex_grad(x, y, dout):
+    out = x + 1j * y
+    out_rank = out.ndim
+    delta_rank_x = out_rank - x.ndim
+    delta_rank_y = out_rank - y.ndim
+
+    dx_reduce_axes = []
+    dy_reduce_axes = []
 
+    for i in range(out_rank):
+        if i < delta_rank_x or dout.shape[i] > x.shape[i - delta_rank_x]:
+            dx_reduce_axes.append(i)
+        if i < delta_rank_y or dout.shape[i] > y.shape[i - delta_rank_y]:
+            dy_reduce_axes.append(i)
+    dx = np.sum(dout.real, axis=tuple(dx_reduce_axes)).reshape(x.shape)
+    dy = np.sum(dout.imag, axis=tuple(dy_reduce_axes)).reshape(y.shape)
+    return (dx, dy)
+
+
+class TestComplexOp(OpTest):
+    def init_spec(self):
+        self.x_shape = [10, 10]
+        self.y_shape = [10, 10]
+        self.dtype = "float64"
 
-class TestConjOp(OpTest):
     def setUp(self):
-        self.op_type = "conj"
-        self.python_api = paddle.tensor.conj
-        self.init_dtype_type()
-        self.init_input_output()
-        self.init_grad_input_output()
-
-    def init_dtype_type(self):
-        self.dtype = np.complex64
-
-    def init_input_output(self):
-        x = (
-            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
-        ).astype(self.dtype)
-        out = np.conj(x)
-
-        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
-        self.outputs = {'Out': out}
-
-    def init_grad_input_output(self):
-        self.grad_out = (np.ones((12, 14)) + 1j * np.ones((12, 14))).astype(
+        self.op_type = "complex"
+        self.python_api = paddle.complex
+        self.init_spec()
+        x = np.random.randn(*self.x_shape).astype(self.dtype)
+        y = np.random.randn(*self.y_shape).astype(self.dtype)
+        out_ref = ref_complex(x, y)
+        self.out_grad = np.random.randn(*self.x_shape).astype(
             self.dtype
-        )
-        self.grad_in = np.conj(self.grad_out)
+        ) + 1j * np.random.randn(*self.y_shape).astype(self.dtype)
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': out_ref}
 
     def test_check_output(self):
         self.check_output(check_eager=True)
 
-    def test_check_grad_normal(self):
+    def test_check_grad(self):
+        dout = self.out_grad
+        dx, dy = ref_complex_grad(
+            self.inputs['X'], self.inputs['Y'], self.out_grad
+        )
         self.check_grad(
-            ['X'],
+            ['X', 'Y'],
             'Out',
-            user_defined_grads=[self.grad_in],
-            user_defined_grad_outputs=[self.grad_out],
+            user_defined_grads=[dx, dy],
+            user_defined_grad_outputs=[dout],
             check_eager=True,
         )
 
+    def test_check_grad_ignore_x(self):
+        dout = self.out_grad
+        dx, dy = ref_complex_grad(
+            self.inputs['X'], self.inputs['Y'], self.out_grad
+        )
+        self.assertTupleEqual(dx.shape, tuple(self.x_shape))
+        self.assertTupleEqual(dy.shape, tuple(self.y_shape))
+        self.check_grad(
+            ['Y'],
+            'Out',
+            no_grad_set=set('X'),
+            user_defined_grads=[dy],
+            user_defined_grad_outputs=[dout],
+            check_eager=True,
+        )
 
-class TestComplexConjOp(unittest.TestCase):
-    def setUp(self):
-        self._dtypes = ["float32", "float64"]
-        self._places = [paddle.CPUPlace()]
-        if paddle.is_compiled_with_cuda():
-            self._places.append(paddle.CUDAPlace(0))
-
-    def test_conj_api(self):
-        for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]
-            ).astype(dtype)
-            for place in self._places:
-                with dg.guard(place):
-                    var_x = paddle.to_tensor(input)
-                    result = paddle.conj(var_x).numpy()
-                    target = np.conj(input)
-                    np.testing.assert_array_equal(result, target)
-
-    def test_conj_operator(self):
-        for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]
-            ).astype(dtype)
-            for place in self._places:
-                with dg.guard(place):
-                    var_x = paddle.to_tensor(input)
-                    result = var_x.conj().numpy()
-                    target = np.conj(input)
-                    np.testing.assert_array_equal(result, target)
-
-    def test_conj_static_mode(self):
-        def init_input_output(dtype):
-            input = rand([2, 20, 2, 3]).astype(dtype) + 1j * rand(
-                [2, 20, 2, 3]
-            ).astype(dtype)
-            return {'x': input}, np.conj(input)
-
-        for dtype in self._dtypes:
-            input_dict, np_res = init_input_output(dtype)
-            for place in self._places:
-                with static.program_guard(static.Program()):
-                    x_dtype = (
-                        np.complex64 if dtype == "float32" else np.complex128
-                    )
-                    x = static.data(
-                        name="x", shape=[2, 20, 2, 3], dtype=x_dtype
-                    )
-                    out = paddle.conj(x)
-
-                    exe = static.Executor(place)
-                    out_value = exe.run(feed=input_dict, fetch_list=[out.name])
-                    np.testing.assert_array_equal(np_res, out_value[0])
-
-    def test_conj_api_real_number(self):
-        for dtype in self._dtypes:
-            input = rand([2, 20, 2, 3]).astype(dtype)
-            for place in self._places:
-                with dg.guard(place):
-                    var_x = paddle.to_tensor(input)
-                    result = paddle.conj(var_x).numpy()
-                    target = np.conj(input)
-                    np.testing.assert_array_equal(result, target)
-
-
-class Testfp16ConjOp(unittest.TestCase):
-    def testfp16(self):
-        input_x = (
-            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
-        ).astype('float16')
-        with static.program_guard(static.Program()):
-            x = static.data(name="x", shape=[12, 14], dtype='float16')
-            out = paddle.conj(x)
-            if paddle.is_compiled_with_cuda():
-                place = paddle.CUDAPlace(0)
-                exe = paddle.static.Executor(place)
-                exe.run(paddle.static.default_startup_program())
-                out = exe.run(feed={'x': input_x}, fetch_list=[out])
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_float16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the float16",
-)
-class TestConjFP16OP(OpTest):
-    def setUp(self):
-        self.op_type = "conj"
-        self.python_api = paddle.tensor.conj
-        self.__class__.op_type = self.op_type
-        self.init_dtype_type()
-        self.init_input_output()
-        # self.init_grad_input_output()
-
-    def init_dtype_type(self):
-        self.dtype = np.float16
-
-    def init_input_output(self):
-        x = (
-            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
-        ).astype(np.float32)
-        out = np.conj(x)
-
-        self.inputs = {'X': x.astype(self.dtype)}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
+    def test_check_grad_ignore_y(self):
+        dout = self.out_grad
+        dx, dy = ref_complex_grad(
+            self.inputs['X'], self.inputs['Y'], self.out_grad
+        )
+        self.check_grad(
             ['X'],
             'Out',
-            max_relative_error=1e-2,
+            no_grad_set=set('Y'),
+            user_defined_grads=[dx],
+            user_defined_grad_outputs=[dout],
+            check_eager=True,
         )
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the bfloat16",
-)
-class TestConjBF16(OpTest):
-    def setUp(self):
-        self.op_type = "conj"
-        self.python_api = paddle.tensor.conj
-        self.__class__.op_type = self.op_type
-        self.init_dtype_type()
-        self.init_input_output()
-        # self.init_grad_input_output()
+class TestComplexOpBroadcast1(TestComplexOp):
+    def init_spec(self):
+        self.x_shape = [10, 3, 1, 4]
+        self.y_shape = [100, 1]
+        self.dtype = "float64"
 
-    def init_dtype_type(self):
-        self.dtype = np.uint16
 
-    def init_input_output(self):
-        x = (
-            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
-        ).astype(np.float32)
-        out = np.conj(x)
+class TestComplexOpBroadcast2(TestComplexOp):
+    def init_spec(self):
+        self.x_shape = [100, 1]
+        self.y_shape = [10, 3, 1, 4]
+        self.dtype = "float32"
 
-        self.inputs = {'X': convert_float_to_uint16(x)}
-        self.outputs = {'Out': convert_float_to_uint16(out)}
 
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-3)
+class TestComplexOpBroadcast3(TestComplexOp):
+    def init_spec(self):
+        self.x_shape = [1, 100]
+        self.y_shape = [100]
+        self.dtype = "float32"
 
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-            max_relative_error=1e-2,
+
+class TestComplexAPI(unittest.TestCase):
+    def setUp(self):
+        self.x = np.random.randn(10, 10)
+        self.y = np.random.randn(10, 10)
+        self.out = ref_complex(self.x, self.y)
+
+    def test_dygraph(self):
+        with dygraph.guard():
+            x = paddle.to_tensor(self.x)
+            y = paddle.to_tensor(self.y)
+            out_np = paddle.complex(x, y).numpy()
+        np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
+
+    def test_static(self):
+        mp, sp = static.Program(), static.Program()
+        with static.program_guard(mp, sp):
+            x = static.data("x", shape=[10, 10], dtype="float64")
+            y = static.data("y", shape=[10, 10], dtype="float64")
+            out = paddle.complex(x, y)
+
+        exe = static.Executor()
+        exe.run(sp)
+        [out_np] = exe.run(
+            mp, feed={"x": self.x, "y": self.y}, fetch_list=[out]
         )
+        np.testing.assert_allclose(self.out, out_np, rtol=1e-05)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py
index b3c24ba5f0017..1c66e8f16fbdb 100644
--- a/python/paddle/fluid/tests/unittests/test_conj_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conj_op.py
@@ -21,8 +21,9 @@
 
 sys.path.append("..")
 from numpy.random import random as rand
-from op_test import OpTest
+from op_test import OpTest, convert_float_to_uint16
 
+import paddle.fluid.core as core
 import paddle.fluid.dygraph as dg
 import paddle.static as static
 
@@ -148,5 +149,85 @@ def testfp16(self):
                 out = exe.run(feed={'x': input_x}, fetch_list=[out])
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_float16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the float16",
+)
+class TestConjFP16OP(OpTest):
+    def setUp(self):
+        self.op_type = "conj"
+        self.python_api = paddle.tensor.conj
+        self.__class__.op_type = self.op_type
+        self.init_dtype_type()
+        self.init_input_output()
+        # self.init_grad_input_output()
+
+    def init_dtype_type(self):
+        self.dtype = np.float16
+
+    def init_input_output(self):
+        x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype(np.float32)
+        out = np.conj(x)
+
+        self.inputs = {'X': x.astype(self.dtype)}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+            max_relative_error=1e-2,
+        )
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestConjBF16(OpTest):
+    def setUp(self):
+        self.op_type = "conj"
+        self.python_api = paddle.tensor.conj
+        self.__class__.op_type = self.op_type
+        self.init_dtype_type()
+        self.init_input_output()
+        # self.init_grad_input_output()
+
+    def init_dtype_type(self):
+        self.dtype = np.uint16
+
+    def init_input_output(self):
+        x = (
+            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
+        ).astype(np.float32)
+        out = np.conj(x)
+
+        self.inputs = {'X': convert_float_to_uint16(x)}
+        self.outputs = {'Out': convert_float_to_uint16(out)}
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(
+            place,
+            ['X'],
+            'Out',
+            max_relative_error=1e-2,
+        )
+
+
 if __name__ == "__main__":
     unittest.main()

From 737a8dde12acc444b650b606fb108a8f8cfecbcb Mon Sep 17 00:00:00 2001
From: longranger2 <836253168@qq.com>
Date: Sun, 19 Mar 2023 23:59:08 +0800
Subject: [PATCH 07/10] drop data_type.h includes and deduplicate conj/temporal_shift fp16/bf16 tests

---
 paddle/phi/kernels/gpu/complex_kernel.cu      |   2 +-
 .../kernels/gpu/temporal_shift_grad_kernel.cu |   1 -
 .../phi/kernels/gpu/temporal_shift_kernel.cu  |   1 -
 .../fluid/tests/unittests/test_conj_op.py     |  49 +-----
 .../tests/unittests/test_temporal_shift_op.py | 149 +++++++-----------
 5 files changed, 65 insertions(+), 137 deletions(-)

diff --git a/paddle/phi/kernels/gpu/complex_kernel.cu b/paddle/phi/kernels/gpu/complex_kernel.cu
index 9ebf56c2a5feb..3b26984d87dde 100644
--- a/paddle/phi/kernels/gpu/complex_kernel.cu
+++ b/paddle/phi/kernels/gpu/complex_kernel.cu
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/complex_kernel.h"
+
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/impl/complex_kernel_impl.h"
 
diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
index ec20e0b523a13..b50fad637d106 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu
@@ -15,7 +15,6 @@
 #include "paddle/phi/kernels/temporal_shift_grad_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
index d83713f064f0a..4904da296488f 100644
--- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
+++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu
@@ -15,7 +15,6 @@
 #include "paddle/phi/kernels/temporal_shift_kernel.h"
 
 #include "paddle/phi/backends/gpu/gpu_context.h"
-#include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 
diff --git a/python/paddle/fluid/tests/unittests/test_conj_op.py b/python/paddle/fluid/tests/unittests/test_conj_op.py
index d2a7bcdad776e..e356743b179db 100644
--- a/python/paddle/fluid/tests/unittests/test_conj_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conj_op.py
@@ -23,7 +23,6 @@
 from eager_op_test import OpTest, convert_float_to_uint16
 from numpy.random import random as rand
 
-
 import paddle.fluid.core as core
 import paddle.fluid.dygraph as dg
 import paddle.static as static
@@ -149,45 +148,10 @@ def testfp16(self):
                 out = exe.run(feed={'x': input_x}, fetch_list=[out])
 
 
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_float16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the float16",
-)
-class TestConjFP16OP(OpTest):
-    def setUp(self):
-        self.op_type = "conj"
-        self.python_api = paddle.tensor.conj
-        self.__class__.op_type = self.op_type
-        self.init_dtype_type()
-        self.init_input_output()
-        # self.init_grad_input_output()
-
+class TestConjFP16OP(TestConjOp):
     def init_dtype_type(self):
         self.dtype = np.float16
 
-    def init_input_output(self):
-        x = (
-            np.random.random((12, 14)) + 1j * np.random.random((12, 14))
-        ).astype(np.float32)
-        out = np.conj(x)
-
-        self.inputs = {'X': x.astype(self.dtype)}
-        self.outputs = {'Out': out}
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-3)
-
-    def test_check_grad(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-            max_relative_error=1e-2,
-        )
-
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
@@ -198,10 +162,8 @@ class TestConjBF16(OpTest):
     def setUp(self):
         self.op_type = "conj"
         self.python_api = paddle.tensor.conj
-        self.__class__.op_type = self.op_type
         self.init_dtype_type()
         self.init_input_output()
-        # self.init_grad_input_output()
 
     def init_dtype_type(self):
         self.dtype = np.uint16
@@ -217,16 +179,11 @@ def init_input_output(self):
 
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, atol=1e-3)
+        self.check_output_with_place(place)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-            max_relative_error=1e-2,
-        )
+        self.check_grad_with_place(place, ['X'], 'Out')
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
index 62591655a4508..64f3ac0169f2c 100644
--- a/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_temporal_shift_op.py
@@ -44,6 +44,7 @@ def temporal_shift(x, seg_num, shift_ratio, data_format):
 class TestTemporalShift(OpTest):
     def setUp(self):
         self.initTestCase()
+        self.init_dtype()
         self.op_type = 'temporal_shift'
         self.python_api = paddle.nn.functional.temporal_shift
         x = np.random.random(self.x_shape).astype(self.dtype)
@@ -64,6 +65,9 @@ def setUp(self):
         self.outputs = {"Out": output}
         self.python_out_sig = ["Out"]
 
+    def init_dtype(self):
+        self.dtype = 'float64'
+
     def test_check_output(self):
         self.check_output(check_eager=True)
 
@@ -74,7 +78,6 @@ def initTestCase(self):
         self.x_shape = (6, 4, 4, 4)
         self.seg_num = 3
         self.shift_ratio = 0.25
-        self.dtype = 'float64'
         self.data_format = 'NCHW'
 
 
@@ -103,105 +106,25 @@ def initTestCase(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_float16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the float16",
-)
-class TestTemporalShiftFP16OP(OpTest):
-    def initTestCase(self):
-        self.x_shape = (3, 10, 5, 5)
-        self.seg_num = 1
-        self.shift_ratio = 0.3
-        self.dtype = np.float16
-        self.data_format = 'NCHW'
-
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'temporal_shift'
-        self.python_api = paddle.nn.functional.temporal_shift
-        self.__class__.op_type = self.op_type
-        x = np.random.random(self.x_shape).astype(np.float32)
-
-        self.attrs = {
-            "seg_num": self.seg_num,
-            "shift_ratio": self.shift_ratio,
-            "data_format": self.data_format,
-        }
-
-        self.inputs = {
-            "X": x.astype(self.dtype),
-        }
-
-        output = temporal_shift(
-            x, self.seg_num, self.shift_ratio, self.data_format
-        )
-        self.outputs = {"Out": output}
-        self.python_out_sig = ["Out"]
-
-    def test_check_output(self):
-        place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_eager=True, atol=1e-3)
-
-    def test_check_grad_ignore_uv(self):
-        place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-            check_eager=True,
-            max_relative_error=1e-2,
-        )
-
-
-@unittest.skipIf(
-    not core.is_compiled_with_cuda()
-    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
-    "core is not complied with CUDA and not support the bfloat16",
+    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
 )
-class TestTemporalShiftBF16(OpTest):
+class TestTemporalShiftFP16(TestTemporalShift):
     def initTestCase(self):
         self.x_shape = (3, 10, 5, 5)
         self.seg_num = 1
         self.shift_ratio = 0.3
-        self.dtype = np.uint16
+        self.dtype = 'float16'
         self.data_format = 'NCHW'
 
-    def setUp(self):
-        self.initTestCase()
-        self.op_type = 'temporal_shift'
-        self.python_api = paddle.nn.functional.temporal_shift
-        self.__class__.op_type = self.op_type
-        x = np.random.random(self.x_shape).astype(np.float32)
-
-        self.attrs = {
-            "seg_num": self.seg_num,
-            "shift_ratio": self.shift_ratio,
-            "data_format": self.data_format,
-        }
-
-        self.inputs = {
-            "X": convert_float_to_uint16(x),
-        }
-
-        output = temporal_shift(
-            x, self.seg_num, self.shift_ratio, self.data_format
-        )
-        self.outputs = {"Out": convert_float_to_uint16(output)}
-        self.python_out_sig = ["Out"]
-
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place, check_eager=True, atol=1e-3)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place)
 
     def test_check_grad_ignore_uv(self):
         place = core.CUDAPlace(0)
-        self.check_grad_with_place(
-            place,
-            ['X'],
-            'Out',
-            check_eager=True,
-            max_relative_error=1e-2,
-        )
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(place, ['X'], 'Out')
 
 
 class TestTemporalShiftAPI(unittest.TestCase):
@@ -254,6 +177,56 @@ def attr_data_format():
         self.assertRaises(ValueError, attr_data_format)
 
 
+class TestTemporalShiftFP16OP(TestTemporalShift):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not complied with CUDA and not support the bfloat16",
+)
+class TestTemporalShiftBF16(OpTest):
+    def initTestCase(self):
+        self.x_shape = (3, 10, 5, 5)
+        self.seg_num = 1
+        self.shift_ratio = 0.3
+        self.dtype = np.uint16
+        self.data_format = 'NCHW'
+
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = 'temporal_shift'
+        self.python_api = paddle.nn.functional.temporal_shift
+
+        x = np.random.random(self.x_shape).astype(np.float32)
+
+        self.attrs = {
+            "seg_num": self.seg_num,
+            "shift_ratio": self.shift_ratio,
+            "data_format": self.data_format,
+        }
+
+        self.inputs = {
+            "X": convert_float_to_uint16(x),
+        }
+
+        output = temporal_shift(
+            x, self.seg_num, self.shift_ratio, self.data_format
+        )
+        self.outputs = {"Out": convert_float_to_uint16(output)}
+        self.python_out_sig = ["Out"]
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_ignore_uv(self):
+        place = core.CUDAPlace(0)
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
 if __name__ == "__main__":
     paddle.enable_static()
     unittest.main()

From 0beacd2f22ee7657207df1118c094ffbb3883f1b Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 20 Mar 2023 09:00:47 +0800
Subject: [PATCH 08/10] Update complex_kernel.h

Fix bug: remove the paddle/phi/core/device_context.h include from complex_kernel.h.
---
 paddle/phi/kernels/complex_kernel.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/complex_kernel.h b/paddle/phi/kernels/complex_kernel.h
index 5c9cf9c408237..ad66b890b3d5a 100644
--- a/paddle/phi/kernels/complex_kernel.h
+++ b/paddle/phi/kernels/complex_kernel.h
@@ -16,7 +16,6 @@ limitations under the License. */
 
 #include "paddle/phi/common/complex.h"
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/device_context.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 

From 4bf323df168c6ebc4be2caba68f0b8321d51760a Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 20 Mar 2023 09:01:20 +0800
Subject: [PATCH 09/10] Update temporal_shift_grad_kernel.h

Fix bug: remove the paddle/phi/core/device_context.h include from temporal_shift_grad_kernel.h.
---
 paddle/phi/kernels/temporal_shift_grad_kernel.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/temporal_shift_grad_kernel.h b/paddle/phi/kernels/temporal_shift_grad_kernel.h
index e91d08045ab88..1bcd3d61c26f5 100644
--- a/paddle/phi/kernels/temporal_shift_grad_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_grad_kernel.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/device_context.h"
 
 namespace phi {
 

From ba8553ffcaf46d7697d744d47b9ccaeef521a677 Mon Sep 17 00:00:00 2001
From: LoneRanger <836253168@qq.com>
Date: Mon, 20 Mar 2023 09:02:21 +0800
Subject: [PATCH 10/10] Update temporal_shift_kernel.h

Fix bug: remove the paddle/phi/core/device_context.h include from temporal_shift_kernel.h.
---
 paddle/phi/kernels/temporal_shift_kernel.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/paddle/phi/kernels/temporal_shift_kernel.h b/paddle/phi/kernels/temporal_shift_kernel.h
index 7c85ffd9783aa..a927d7fb23aae 100644
--- a/paddle/phi/kernels/temporal_shift_kernel.h
+++ b/paddle/phi/kernels/temporal_shift_kernel.h
@@ -15,7 +15,6 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/core/device_context.h"
 
 namespace phi {