diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 5e39b764fa96d..241fafbf9cdf5 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -432,6 +432,7 @@ infer_meta : func : UnchangedInferMeta param : [x] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : cos_grad backward : cos_double_grad @@ -708,6 +709,7 @@ infer_meta : func : UnchangedInferMeta param : [out] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : exp_grad inplace : (out_grad -> x_grad) @@ -1907,6 +1909,7 @@ infer_meta : func : UnchangedInferMeta param : [out] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : rsqrt_grad backward : rsqrt_double_grad @@ -2062,6 +2065,7 @@ infer_meta : func : UnchangedInferMeta param : [x] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : silu_grad backward : silu_double_grad @@ -2088,6 +2092,7 @@ infer_meta : func : UnchangedInferMeta param : [x] + spmd_rule : ElementwiseUnaryGradInferSpmd kernel : func : sin_grad backward : sin_double_grad diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 0cb62f2a84c76..e5e3f9fb86c53 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -193,6 +193,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param : [x, y] + spmd_rule : ElementwiseBinaryGradInferSpmd kernel : func : divide_grad composite : divide_grad(x, y, out, out_grad, axis, x_grad, y_grad) @@ -226,6 +227,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param: [x, y] + spmd_rule : ElementwiseBinaryGradInferSpmd composite : elementwise_pow_grad(x, y, out_grad, x_grad, y_grad) kernel : func : elementwise_pow_grad @@ -440,6 +442,7 @@ infer_meta : func : GeneralBinaryGradInferMeta param : [x, y] + spmd_rule : ElementwiseBinaryGradInferSpmd kernel : func : multiply_grad composite: multiply_grad(x, y, out_grad, axis, x_grad, y_grad) diff --git a/paddle/phi/api/yaml/legacy_ops.yaml b/paddle/phi/api/yaml/legacy_ops.yaml index f43ae357df3e8..d37377890c9a9 100755 --- a/paddle/phi/api/yaml/legacy_ops.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -317,6 +317,7 @@ output : Tensor(out) infer_meta : func : ElementwiseInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : divide inplace: (x -> out) @@ -348,6 +349,7 @@ output : Tensor(out) infer_meta : func : ElementwiseInferMeta + spmd_rule: ElementwiseBinaryInferSpmd kernel : func : elementwise_pow backward : elementwise_pow_grad @@ -415,6 +417,7 @@ output : Tensor(out) infer_meta : func : CompareInferMeta + spmd_rule: ElementwiseBinaryInferSpmd kernel : func : equal inplace: (x -> out) @@ -783,6 +786,7 @@ output : Tensor infer_meta : func : ElementwiseInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : multiply {dense, dense -> dense}, multiply_sr {selected_rows, dense -> selected_rows} @@ -803,6 +807,7 @@ output : Tensor(out) infer_meta : func : CompareInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : not_equal inplace: (x -> out) diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 97d4f3dc0d17e..e6b11884f74eb 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -327,6 +327,7 @@ output : Tensor(out) infer_meta : func : ElementwiseInferMeta + spmd_rule : ElementwiseBinaryInferSpmd kernel : func : bitwise_and backend : x @@ -337,6 +338,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func 
: bitwise_not backend : x @@ -550,6 +552,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : cos inplace: (x -> out) @@ -819,6 +822,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : exp inplace : (x -> out) @@ -2180,6 +2184,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : rsqrt inplace : (x -> out) @@ -2360,6 +2365,7 @@ output : Tensor infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : silu backward : silu_grad @@ -2369,6 +2375,7 @@ output : Tensor(out) infer_meta : func : UnchangedInferMeta + spmd_rule : ElementwiseUnaryInferSpmd kernel : func : sin inplace: (x -> out) diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h index f4a4cd68ce5e1..0ee6a513c0e48 100644 --- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h +++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h @@ -76,14 +76,14 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx, do { \ if (phi::CPUContext::classof(dev_ctx)) { \ VLOG(4) << "Call `" << #fn_name << "` in Resharding on GPU."; \ - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( \ + PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES( \ dtype, #fn_name, ([&] { \ fn_name(static_cast(*dev_ctx), \ __VA_ARGS__); \ })); \ } else if (phi::GPUContext::classof(dev_ctx)) { \ VLOG(4) << "Call `" << #fn_name << "` in Resharding on CPU."; \ - PD_VISIT_FLOATING_AND_INTEGRAL_TYPES( \ + PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES( \ dtype, #fn_name, ([&] { \ fn_name(static_cast(*dev_ctx), \ __VA_ARGS__); \ diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h index c5612b203d233..5206da6ec3785 100644 --- a/paddle/phi/core/visit_type.h +++ b/paddle/phi/core/visit_type.h @@ -148,6 +148,33 @@ namespace phi { } \ }() +///////// BOOL and Floating and Integral Dispatch Marco /////////// + +#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::BOOL, bool, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT32, float, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::FLOAT64, double, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT64, int64_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT8, int8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::UINT8, uint8_t, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE( \ + NAME, ::paddle::DataType::INT16, int16_t, __VA_ARGS__) \ + default: \ + PD_THROW("function " #NAME " is not implemented for data type `", \ + __dtype__, \ + "`"); \ + } \ + }() + ///////// Floating and Complex Dispatch Marco /////////// #define PD_VISIT_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc index 3a9e422320210..9ec18bdaf50ce 100644 --- a/paddle/phi/infermeta/spmd_rules/elementwise.cc +++ b/paddle/phi/infermeta/spmd_rules/elementwise.cc @@ -314,6 +314,13 @@ SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, return {{out_grad.dist_attr(), out_grad.dist_attr()}, {out_grad.dist_attr()}}; } +SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out, + const DistMetaTensor& out_grad) { + return {{out_grad.dist_attr(), out_grad.dist_attr(), out_grad.dist_attr()}, + {out_grad.dist_attr()}}; +} + SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y, const DistMetaTensor& out_grad, @@ -381,5 +388,17 @@ SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, {x_grad_dist_attr, y_grad_dist_attr}}; } +SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out, + const DistMetaTensor& out_grad, + int64_t axis) { + // out's dist_attr is the same as out_grad's dist_attr, so reuse + // ElementwiseBinaryGradInferSpmd(x, y, out_grad, axis) to infer the dist_attrs of + // {{x, y, out_grad}, {x_grad, y_grad}}, then insert out's dist_attr into the input list. + SpmdInfo info = ElementwiseBinaryGradInferSpmd(x, y, out_grad, axis); + info.first.emplace(info.first.begin() + 2, out_grad.dist_attr()); + return info; +} } // namespace distributed } // namespace phi diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.h b/paddle/phi/infermeta/spmd_rules/elementwise.h index 637c3b793b6c4..2dd8d4c764a40 100644 --- a/paddle/phi/infermeta/spmd_rules/elementwise.h +++ b/paddle/phi/infermeta/spmd_rules/elementwise.h @@ -30,6 +30,10 @@ SpmdInfo ElementwiseUnaryInferSpmdReverse(const DistMetaTensor& x, SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& out_grad); +SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& out, + const DistMetaTensor& out_grad); + SpmdInfo ElementwiseBinaryInferSpmd(const DistMetaTensor& x, const DistMetaTensor& y); @@ -42,5 +46,11 @@ SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, const DistMetaTensor& out_grad, int64_t axis = -1); +SpmdInfo ElementwiseBinaryGradInferSpmd(const DistMetaTensor& x, + const DistMetaTensor& y, + const DistMetaTensor& out, + const DistMetaTensor& out_grad, + int64_t axis = -1); + } // namespace distributed } // namespace phi diff --git a/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc index 1a9a737866153..f09e09a1a14aa 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_grad_kernel.cc @@ -50,6 +50,7 @@ PD_REGISTER_KERNEL(divide_grad, int16_t, int, int64_t, + bool, phi::dtype::complex<float>, phi::dtype::complex<double>) {} @@ -61,5 +62,6 @@ PD_REGISTER_KERNEL(divide_double_grad, double, int, int64_t, + bool, phi::dtype::complex<float>, phi::dtype::complex<double>) {} diff --git a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc index a5fc4552bfbf2..b7fdefe023e73 100644 --- a/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/cpu/elementwise_divide_kernel.cc @@ -64,5 +64,6 @@ PD_REGISTER_KERNEL(divide, int16_t, int, int64_t, + bool, complex64, complex128) {} diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 783d94e8e7bb2..3261243c986c0 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -329,6 +329,7 @@ PD_REGISTER_KERNEL(divide_grad, int16_t, int, int64_t, + bool, phi::dtype::complex<float>, phi::dtype::complex<double>) {} @@ -342,6 +343,7 @@ PD_REGISTER_KERNEL(divide_double_grad, double, int, int64_t, + bool, phi::dtype::complex<float>, phi::dtype::complex<double>) {} diff --git a/paddle/phi/kernels/kps/elementwise_kernel.cu b/paddle/phi/kernels/kps/elementwise_kernel.cu index d40f1bd7a7062..6de33dd78d2d0 100644 --- a/paddle/phi/kernels/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/kps/elementwise_kernel.cu @@ -312,6 +312,7 @@ PD_REGISTER_KERNEL(divide, int16_t, int, int64_t, + bool, float16, bfloat16, complex64, diff --git a/paddle/phi/kernels/legacy/cpu/elementwise_divide_kernel.cc b/paddle/phi/kernels/legacy/cpu/elementwise_divide_kernel.cc index ad09d6830f974..6f4debdcb216f 100644 --- a/paddle/phi/kernels/legacy/cpu/elementwise_divide_kernel.cc +++ b/paddle/phi/kernels/legacy/cpu/elementwise_divide_kernel.cc @@ -62,5 +62,6 @@ PD_REGISTER_KERNEL(divide_raw, double, int, int64_t, + bool, complex64, complex128) {} diff --git a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu index 394d525b15f0f..ad802ee190861 100644 --- a/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu +++ b/paddle/phi/kernels/legacy/kps/elementwise_kernel.cu @@ -82,6 +82,7 @@ PD_REGISTER_KERNEL(divide_raw, int16_t, int, int64_t, + bool, float16, bfloat16, complex64, diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt index 6d7c8c149435b..aa1212c9ecc11 100644 --- a/test/auto_parallel/CMakeLists.txt +++ b/test/auto_parallel/CMakeLists.txt @@ -118,7 +118,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_semi_auto_parallel_basic MODULES test_semi_auto_parallel_basic) set_tests_properties(test_semi_auto_parallel_basic - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 120) + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 200) py_test_modules(test_semi_auto_parallel_single_strategy MODULES test_semi_auto_parallel_single_strategy) set_tests_properties(test_semi_auto_parallel_single_strategy diff --git a/test/auto_parallel/semi_auto_parallel_for_bitwise.py b/test/auto_parallel/semi_auto_parallel_for_bitwise.py new file mode 100644 index 0000000000000..1cbc6654b53b5 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_for_bitwise.py @@ -0,0 +1,161 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestBitwiseApiForSemiAutoParallel: + def __init__(self): + self._dtype = os.getenv("dtype") + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + self._check_grad = False + self._rtol = 1e-6 + self._atol = 0.0 + paddle.seed(self._seed) + np.random.seed(self._seed) + + def check_tensor_eq(self, a, b): + np1 = a.numpy() + np2 = b.numpy() + np.testing.assert_allclose( + np1, np2, rtol=self._rtol, atol=self._atol, verbose=True + ) + + def test_unary_body(self, x_shape, out_shape, x_specs, unary_func): + x = paddle.randint(0, 100, x_shape, self._dtype) + x.stop_gradient = False + + x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs) + + dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr) + dist_x.stop_gradient = False + + dist_out = unary_func(dist_x) + out = unary_func(x) + self.check_tensor_eq(out, dist_out) + if self._check_grad: + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) + + def test_binary_body( + self, x_shape, y_shape, out_shape, x_specs, y_specs, binary_func + ): + x = paddle.randint(0, 100, x_shape, self._dtype) + y = paddle.randint(0, 100, y_shape, self._dtype) + x.stop_gradient = False + y.stop_gradient = False + + x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs) + y_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=y_specs) + + dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr) + dist_y = dist.shard_tensor(y, dist_attr=y_dist_attr) + dist_x.stop_gradient = False + dist_y.stop_gradient = False + + dist_out = binary_func(dist_x, dist_y) + out = binary_func(x, y) + self.check_tensor_eq(out, dist_out) + + if self._check_grad: + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) + self.check_tensor_eq(y.grad, dist_y.grad) + + def test_bitwise_and_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_not_x_shard(self): + self.test_unary_body( + x_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + unary_func=paddle.bitwise_not, + ) + + def test_bitwise_not_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.bitwise_not, + ) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.test_bitwise_and_x_shard() + 
self.test_bitwise_and_x_shard_broadcast() + self.test_bitwise_and_x_y_shard() + self.test_bitwise_and_x_y_shard_broadcast() + self.test_bitwise_not_x_shard() + + +if __name__ == '__main__': + TestBitwiseApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_for_compare.py b/test/auto_parallel/semi_auto_parallel_for_compare.py new file mode 100644 index 0000000000000..a174a9c9180e1 --- /dev/null +++ b/test/auto_parallel/semi_auto_parallel_for_compare.py @@ -0,0 +1,172 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np + +import paddle +import paddle.distributed as dist + + +class TestCompareApiForSemiAutoParallel: + def __init__(self): + self._dtype = os.getenv("dtype") + self._backend = os.getenv("backend") + self._seed = eval(os.getenv("seed")) + self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) + self._check_grad = False + self._rtol = 1e-6 + self._atol = 0.0 + paddle.seed(self._seed) + np.random.seed(self._seed) + + def check_tensor_eq(self, a, b): + np1 = a.numpy() + np2 = b.numpy() + np.testing.assert_allclose( + np1, np2, rtol=self._rtol, atol=self._atol, verbose=True + ) + + def test_binary_body( + self, x_shape, y_shape, out_shape, x_specs, y_specs, binary_func + ): + x = paddle.randn(x_shape, self._dtype) + y = paddle.randn(y_shape, self._dtype) + x.stop_gradient = False + y.stop_gradient = False + + x_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=x_specs) + y_dist_attr = dist.DistAttr(mesh=self._mesh, sharding_specs=y_specs) + + dist_x = dist.shard_tensor(x, dist_attr=x_dist_attr) + dist_y = dist.shard_tensor(y, dist_attr=y_dist_attr) + dist_x.stop_gradient = False + dist_y.stop_gradient = False + + dist_out = binary_func(dist_x, dist_y) + out = binary_func(x, y) + self.check_tensor_eq(out, dist_out) + + if self._check_grad: + dist_out.backward() + out.backward() + self.check_tensor_eq(x.grad, dist_x.grad) + self.check_tensor_eq(y.grad, dist_y.grad) + + def test_equal_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.equal, + ) + + def test_equal_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.equal, + ) + + def test_equal_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.equal, + ) + + def test_equal_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.equal, + ) + + def test_not_equal_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + 
out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.not_equal, + ) + + def test_not_equal_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.not_equal, + ) + + def test_not_equal_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.not_equal, + ) + + def test_not_equal_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.not_equal, + ) + + def run_test_case(self): + if self._backend == "cpu": + paddle.set_device("cpu") + elif self._backend == "gpu": + paddle.set_device("gpu:" + str(dist.get_rank())) + else: + raise ValueError("Only support cpu or gpu backend.") + + self.test_equal_x_shard() + self.test_equal_x_shard_broadcast() + self.test_equal_x_y_shard() + self.test_equal_x_y_shard_broadcast() + + self.test_not_equal_x_shard() + self.test_not_equal_x_shard_broadcast() + self.test_not_equal_x_y_shard() + self.test_not_equal_x_y_shard_broadcast() + + +if __name__ == '__main__': + TestCompareApiForSemiAutoParallel().run_test_case() diff --git a/test/auto_parallel/semi_auto_parallel_for_elementwise.py b/test/auto_parallel/semi_auto_parallel_for_elementwise.py index 2a55f7d02df03..0e737db45ecaf 100644 --- a/test/auto_parallel/semi_auto_parallel_for_elementwise.py +++ b/test/auto_parallel/semi_auto_parallel_for_elementwise.py @@ -27,14 +27,17 @@ def __init__(self): self._backend = os.getenv("backend") self._seed = eval(os.getenv("seed")) self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"]) - + self._rtol = 1e-6 + self._atol = 0.0 paddle.seed(self._seed) np.random.seed(self._seed) def check_tensor_eq(self, a, b): np1 = a.numpy() np2 = b.numpy() - np.testing.assert_allclose(np1, np2, rtol=1e-05, verbose=True) + np.testing.assert_allclose( + np1, np2, rtol=self._rtol, atol=self._atol, verbose=True + ) def test_unary_body(self, x_shape, out_shape, x_specs, unary_func): x = paddle.randn(x_shape, self._dtype) @@ -100,9 +103,9 @@ def test_sub_x_shard(self): def test_add_x_shard_broadcast(self): self.test_binary_body( - x_shape=[16, 32], - y_shape=[2, 16, 32], - out_shape=[2, 16, 32], + x_shape=[8, 16], + y_shape=[2, 8, 16], + out_shape=[2, 8, 16], x_specs=['x', None], y_specs=[None, None, None], binary_func=paddle.add, @@ -206,6 +209,256 @@ def test_maximum_x_y_shard_broadcast(self): binary_func=paddle.maximum, ) + def test_multiply_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.multiply, + ) + + def test_multiply_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.multiply, + ) + + def test_multiply_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.multiply, + ) + + def test_multiply_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 6, 8], + y_shape=[6, 8], + out_shape=[4, 6, 8], + x_specs=['x', None, 
None], + y_specs=[None, None], + binary_func=paddle.multiply, + ) + + def test_divide_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.divide, + ) + + def test_divide_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.divide, + ) + + def test_divide_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.divide, + ) + + def test_divide_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[2, 4, 6], + y_shape=[4, 6], + out_shape=[2, 4, 6], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.divide, + ) + + def test_bitwise_and_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.bitwise_and, + ) + + def test_bitwise_and_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 16, 32], + y_shape=[16, 32], + out_shape=[4, 16, 32], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.bitwise_and, + ) + + def test_elementwise_pow_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.pow, + ) + + def test_elementwise_pow_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.pow, + ) + + def test_elementwise_pow_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.pow, + ) + + def test_elementwise_pow_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[4, 6, 8], + y_shape=[6, 8], + out_shape=[4, 6, 8], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.pow, + ) + + def test_equal_x_shard(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, None], + binary_func=paddle.equal, + ) + + def test_equal_x_shard_broadcast(self): + self.test_binary_body( + x_shape=[16, 32], + y_shape=[2, 16, 32], + out_shape=[2, 16, 32], + x_specs=['x', None], + y_specs=[None, None, None], + binary_func=paddle.equal, + ) + + def test_equal_x_y_shard(self): + if self._backend == "cpu": + return + self.test_binary_body( + x_shape=[16, 32], + y_shape=[16, 32], + out_shape=[16, 32], + x_specs=['x', None], + y_specs=[None, 'x'], + binary_func=paddle.equal, + ) + + def test_equal_x_y_shard_broadcast(self): + self.test_binary_body( + x_shape=[2, 6, 4], + 
y_shape=[6, 4], + out_shape=[2, 6, 4], + x_specs=['x', None, None], + y_specs=[None, None], + binary_func=paddle.equal, + ) + + def test_exp_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.exp, + ) + + def test_rsqrt_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.rsqrt, + ) + + def test_silu_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.nn.functional.silu, + ) + + def test_sin_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.sin, + ) + + def test_cos_x_shard(self): + self.test_unary_body( + x_shape=[4, 16], + out_shape=[4, 16], + x_specs=['x', None], + unary_func=paddle.cos, + ) + def run_test_case(self): if self._backend == "cpu": paddle.set_device("cpu") @@ -222,6 +475,27 @@ def run_test_case(self): self.test_sub_x_y_shard_broadcast() self.test_square_x_shard() self.test_relu_x_shard() + self.test_maximum_x_shard() + self.test_maximum_x_shard_broadcast() + self.test_maximum_x_y_shard() + self.test_maximum_x_y_shard_broadcast() + self.test_multiply_x_shard() + self.test_multiply_x_shard_broadcast() + self.test_multiply_x_y_shard() + self.test_multiply_x_y_shard_broadcast() + self.test_divide_x_shard() + self.test_divide_x_shard_broadcast() + self.test_divide_x_y_shard() + self.test_divide_x_y_shard_broadcast() + self.test_elementwise_pow_x_shard() + self.test_elementwise_pow_x_shard_broadcast() + self.test_elementwise_pow_x_y_shard() + self.test_elementwise_pow_x_y_shard_broadcast() + self.test_exp_x_shard() + self.test_rsqrt_x_shard() + self.test_silu_x_shard() + self.test_sin_x_shard() + self.test_cos_x_shard() if __name__ == '__main__': diff --git a/test/auto_parallel/test_semi_auto_parallel_basic.py b/test/auto_parallel/test_semi_auto_parallel_basic.py index 56cabcb318f3d..3730e019f7506 100644 --- a/test/auto_parallel/test_semi_auto_parallel_basic.py +++ b/test/auto_parallel/test_semi_auto_parallel_basic.py @@ -56,6 +56,16 @@ def test_reduction_api(self): user_defined_envs=envs, ) + def test_bitwise_api(self): + envs_list = test_base.gen_product_envs_list( + {"dtype": "int32", "seed": "2023"}, self._changeable_envs + ) + for envs in envs_list: + self.run_test_case( + "semi_auto_parallel_for_bitwise.py", + user_defined_envs=envs, + ) + def test_several_replicated_spmd_api(self): envs_list = test_base.gen_product_envs_list( self._default_envs, self._changeable_envs
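
Reviewer note (appended after the patch, not part of it): the snippet below is a minimal usage sketch of what the new spmd_rule entries enable, modeled directly on semi_auto_parallel_for_elementwise.py above. It assumes the same two-rank setup the tests use (a ProcessMesh over ranks [0, 1], run under a distributed launcher such as the test driver invoked by test_semi_auto_parallel_basic.py); the tensor shapes and variable names are illustrative only.

import paddle
import paddle.distributed as dist

mesh = dist.ProcessMesh([0, 1], dim_names=["x"])

# Dense inputs; x will be sharded along the mesh axis "x", y stays replicated.
x = paddle.randn([16, 32], dtype="float32")
y = paddle.randn([16, 32], dtype="float32")
x.stop_gradient = False
y.stop_gradient = False

dist_x = dist.shard_tensor(
    x, dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=["x", None])
)
dist_y = dist.shard_tensor(
    y, dist_attr=dist.DistAttr(mesh=mesh, sharding_specs=[None, None])
)
dist_x.stop_gradient = False
dist_y.stop_gradient = False

# Forward dispatches through ElementwiseBinaryInferSpmd (the new spmd_rule on
# divide in legacy_ops.yaml), so the output's dist_attr is inferred from the
# inputs rather than falling back to a default.
dist_out = paddle.divide(dist_x, dist_y)

# Backward dispatches through ElementwiseBinaryGradInferSpmd
# (legacy_backward.yaml), propagating dist_attrs to x_grad / y_grad as well.
dist_out.backward()
print(dist_x.grad)

# Unary ops covered by this change (sin, cos, exp, rsqrt, silu) go through
# ElementwiseUnaryInferSpmd / ElementwiseUnaryGradInferSpmd the same way.
dist_sin = paddle.sin(dist_x)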