From abc1c3d40c4041d56e59a6cdac4317c2ca95e74b Mon Sep 17 00:00:00 2001
From: lishicheng1996 <43111799+lishicheng1996@users.noreply.github.com>
Date: Tue, 27 Jun 2023 17:28:59 +0800
Subject: [PATCH] [BugFix] fix bugs in DCU unit tests (#54874)

* block bf16 tests on ROCm

* block more bf16 tests on ROCm

* some unit test cases don't have kernels on ROCm

* some unit test cases don't have kernels on ROCm

* fix code style
---
 test/legacy_test/test_assign_op.py            |  3 +-
 test/legacy_test/test_cast_op.py              |  8 +++++
 test/legacy_test/test_elementwise_mul_op.py   |  4 +++
 test/legacy_test/test_elementwise_pow_op.py   |  4 +++
 test/legacy_test/test_fill_any_like_op.py     |  3 +-
 test/legacy_test/test_layer_norm_op.py        | 33 +++++++++++++++++--
 test/legacy_test/test_matmul_v2_op.py         |  1 +
 test/legacy_test/test_reduce_op.py            | 11 ++++++-
 test/legacy_test/test_reshape_op.py           |  4 +++
 test/legacy_test/test_scale_op.py             |  3 +-
 ..._model_parallel_fused_multi_transformer.py |  5 ++-
 11 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/test/legacy_test/test_assign_op.py b/test/legacy_test/test_assign_op.py
index 9069b11669d3e..9299b07fc21e8 100644
--- a/test/legacy_test/test_assign_op.py
+++ b/test/legacy_test/test_assign_op.py
@@ -72,7 +72,8 @@ def test_backward(self):
 
 
 @unittest.skipIf(
-    not paddle.is_compiled_with_cuda(), "BFP16 test runs only on GPU"
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
 )
 class TestAssignBFP16Op(eager_op_test.OpTest):
     def setUp(self):
diff --git a/test/legacy_test/test_cast_op.py b/test/legacy_test/test_cast_op.py
index c830f5f9f81aa..dde01a2296c38 100644
--- a/test/legacy_test/test_cast_op.py
+++ b/test/legacy_test/test_cast_op.py
@@ -95,6 +95,10 @@ def test_grad(self):
         self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
 
 
+@unittest.skipIf(
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
+)
 class TestCastOpBf16ToFp32(OpTest):
     def setUp(self):
         ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16')
@@ -120,6 +124,10 @@ def test_grad(self):
         self.check_grad(['X'], ['Out'], check_prim=True, only_check_prim=True)
 
 
+@unittest.skipIf(
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
+)
 class TestCastOpFp32ToBf16(OpTest):
     def setUp(self):
         ipt = np.random.random(size=[10, 10]).astype('float32')
diff --git a/test/legacy_test/test_elementwise_mul_op.py b/test/legacy_test/test_elementwise_mul_op.py
index 8356d055c208c..987d15419109c 100644
--- a/test/legacy_test/test_elementwise_mul_op.py
+++ b/test/legacy_test/test_elementwise_mul_op.py
@@ -163,6 +163,10 @@ def init_input_output(self):
         self.out = np.multiply(self.x, self.y)
 
 
+@unittest.skipIf(
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
+)
 class TestBF16ElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
diff --git a/test/legacy_test/test_elementwise_pow_op.py b/test/legacy_test/test_elementwise_pow_op.py
index d450cc8a606d6..88297a2293a21 100644
--- a/test/legacy_test/test_elementwise_pow_op.py
+++ b/test/legacy_test/test_elementwise_pow_op.py
@@ -268,6 +268,10 @@ def test_check_grad(self):
         )
 
 
+@unittest.skipIf(
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
+)
 class TestElementwisePowBF16Op(OpTest):
     def setUp(self):
         self.op_type = "elementwise_pow"
diff --git a/test/legacy_test/test_fill_any_like_op.py b/test/legacy_test/test_fill_any_like_op.py
index 36cf77195ccdb..31a3fa3836323 100644
--- a/test/legacy_test/test_fill_any_like_op.py
+++ b/test/legacy_test/test_fill_any_like_op.py
@@ -64,7 +64,8 @@ def if_enable_cinn(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+    not core.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "core is not compiled with CUDA",
 )
 class TestFillAnyLikeOpBfloat16(OpTest):
     def setUp(self):
diff --git a/test/legacy_test/test_layer_norm_op.py b/test/legacy_test/test_layer_norm_op.py
index 6fa2c41da3eea..32d23ad3e1c72 100644
--- a/test/legacy_test/test_layer_norm_op.py
+++ b/test/legacy_test/test_layer_norm_op.py
@@ -126,6 +126,10 @@ def layer_norm_wrapper(
     )
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support fp64 LayerNormOpByOp currently",
+)
 class TestLayerNormOpByOpTest(OpTest):
     def setUp(self):
         self.python_api = layer_norm_wrapper
@@ -164,7 +168,7 @@ def initConfig(self):
         self.cinn_rtol = 1e-5
 
         self.max_relative_error = 1e-5
-
+        # ROCm does not have float64 LayerNorm kernel
         self.dtype = "float64"
         self.x_shape = [2, 6, 6, 3]
         self.epsilon = 0.00001
@@ -218,6 +222,7 @@ def initTestCase(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
@@ -306,6 +311,10 @@ def initTestCase(self):
         }
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support fp64 LayerNormOpByOp currently",
+)
 class TestLayerNormOpByOpTestFP64_case2(TestLayerNormOpByOpTest):
     def initConfig(self):
         self.rev_comp_atol = 1e-6
@@ -328,6 +337,10 @@ def initConfig(self):
         self.has_bias = False
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support bf16 LayerNormOpByOp currently",
+)
 class TestLayerNormBF16OpByOpTest_case2(TestLayerNormBF16OpByOpTest):
     def initConfig(self):
         self.ori_atol = 1e-2
@@ -343,6 +356,10 @@ def initConfig(self):
         self.has_bias = False
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support fp64 LayerNormOpByOp currently",
+)
 class TestLayerNormOpByOpTestFP64_case3(TestLayerNormOpByOpTest):
     def initConfig(self):
         self.rev_comp_atol = 1e-7
@@ -365,6 +382,10 @@ def initConfig(self):
         self.has_bias = False
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support bf16 LayerNormOpByOp currently",
+)
 class TestLayerNormBF16OpByOpTest_case3(TestLayerNormBF16OpByOpTest):
     def initConfig(self):
         self.ori_atol = 1e-2
@@ -380,6 +401,10 @@ def initConfig(self):
         self.has_bias = False
 
 
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(),
+    "ROCm doesn't support fp64 LayerNormOpByOp currently",
+)
 class TestLayerNormOpByOpTestFP64_case4(TestLayerNormOpByOpTest):
     def initConfig(self):
         self.rev_comp_atol = 1e-6
@@ -801,6 +826,10 @@ def assert_equal(x, y):
         assert_equal(b_g_np_1, b_g_np_2)
 
 
+@unittest.skipIf(
+    not core.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BF16 is only supported on CUDA.",
+)
 class TestBF16ScaleBiasLayerNorm(unittest.TestCase):
     def check_main(self, x_np, weight_np, bias_np, dtype):
         paddle.disable_static()
@@ -934,7 +963,7 @@ def check_with_dtype(self, dtype):
         )
 
     def test_main(self):
-        if not paddle.is_compiled_with_cuda():
+        if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm():
             return
         self.check_with_dtype(dtype="float32")
         self.check_with_dtype(dtype="bfloat16")
diff --git a/test/legacy_test/test_matmul_v2_op.py b/test/legacy_test/test_matmul_v2_op.py
index f7b83fce17787..6adc3603fb03e 100644
--- a/test/legacy_test/test_matmul_v2_op.py
+++ b/test/legacy_test/test_matmul_v2_op.py
@@ -405,6 +405,7 @@ def test_check_grad(self):
 def create_test_bf16_class(parent, atol=0.01):
     @unittest.skipIf(
         not core.is_compiled_with_cuda()
+        or paddle.is_compiled_with_rocm()
         or not core.is_bfloat16_supported(core.CUDAPlace(0)),
         "core is not compiled with CUDA and not support the bfloat16",
     )
diff --git a/test/legacy_test/test_reduce_op.py b/test/legacy_test/test_reduce_op.py
index 95d5fb5ceb2a3..4320cfd2a5d47 100644
--- a/test/legacy_test/test_reduce_op.py
+++ b/test/legacy_test/test_reduce_op.py
@@ -198,7 +198,8 @@ def test_check_grad(self):
 
 def create_test_bf16_class(parent):
     @unittest.skipIf(
-        not core.is_compiled_with_cuda(), "core is not compiled with CUDA"
+        not core.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+        "core is not compiled with CUDA",
     )
     class TestSumOpBf16(parent):
         def setUp(self):
@@ -349,6 +350,7 @@ def init_dtype(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
@@ -449,6 +451,9 @@ def test_check_output(self):
     reason="reduce_min is discontinuous non-derivable function,"
     " its gradient check is not supported by unittest framework."
 )
+@unittest.skipIf(
+    paddle.is_compiled_with_rocm(), "ROCm doesn't have FP16 reduce_min kernel"
+)
 class TestMinFP16Op(OpTest):
     """Remove Min with subgradient from gradient check to confirm the success of CI."""
 
@@ -479,6 +484,7 @@ def test_check_output(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
@@ -541,6 +547,7 @@ def test_check_grad(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
@@ -648,6 +655,7 @@ def test_check_grad(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
@@ -721,6 +729,7 @@ def test_check_grad(self):
 
 @unittest.skipIf(
     not core.is_compiled_with_cuda()
+    or paddle.is_compiled_with_rocm()
     or not core.is_bfloat16_supported(core.CUDAPlace(0)),
     "core is not compiled with CUDA or not support the bfloat16",
 )
diff --git a/test/legacy_test/test_reshape_op.py b/test/legacy_test/test_reshape_op.py
index d5acc54d5721b..2feecb5005b14 100755
--- a/test/legacy_test/test_reshape_op.py
+++ b/test/legacy_test/test_reshape_op.py
@@ -86,6 +86,10 @@ def init_data(self):
         self.infered_shape = ()
 
 
+@unittest.skipIf(
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
+)
 class TestReshapeBF16Op(OpTest):
     def setUp(self):
         self.init_data()
diff --git a/test/legacy_test/test_scale_op.py b/test/legacy_test/test_scale_op.py
index 40712745dec3d..7708ce8deaa88 100644
--- a/test/legacy_test/test_scale_op.py
+++ b/test/legacy_test/test_scale_op.py
@@ -155,7 +155,8 @@ def test_check_grad(self):
 
 
 @unittest.skipIf(
-    not core.is_compiled_with_rocm(), "core is not compiled with CUDA"
+    not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(),
+    "BFP16 test runs only on CUDA",
 )
 class TestScaleBF16Op(OpTest):
     def setUp(self):
diff --git a/test/legacy_test/test_static_model_parallel_fused_multi_transformer.py b/test/legacy_test/test_static_model_parallel_fused_multi_transformer.py
index f4637b070cbf9..705680b531b30 100644
--- a/test/legacy_test/test_static_model_parallel_fused_multi_transformer.py
+++ b/test/legacy_test/test_static_model_parallel_fused_multi_transformer.py
@@ -34,7 +34,10 @@ def _setup_config(self):
     def test_dist_static_model_parallel_fused_multi_transformer(self):
         from paddle import fluid
 
-        if fluid.core.is_compiled_with_cuda():
+        if (
+            fluid.core.is_compiled_with_cuda()
+            and not paddle.is_compiled_with_rocm()
+        ):
             self.check_with_place(
                 "static_model_parallel_fused_multi_transformer.py",
                 delta=1e-5,