From 2a2b727bbddbfaa90aa5673f09059d0ecd2b8503 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Wed, 6 Sep 2023 20:18:43 +0800
Subject: [PATCH 1/3] support reshard s to s

---
 paddle/fluid/pybind/auto_parallel_py.cc       |   5 +
 .../distributed/auto_parallel/CMakeLists.txt  |   3 +-
 .../auto_parallel/r_to_p_reshard_function.cc  |  16 +-
 .../distributed/auto_parallel/reshard_utils.h |  25 ++++
 .../auto_parallel/s_to_s_reshard_function.cc  | 140 ++++++++++++++++++
 .../auto_parallel/s_to_s_reshard_function.h   |  37 +++++
 paddle/phi/kernels/all_to_all_kernel.h        |  10 ++
 paddle/phi/kernels/cpu/all_to_all_kernel.cc   |   2 +
 .../phi/kernels/cpu/transpose_grad_kernel.cc  |   3 +
 paddle/phi/kernels/cpu/transpose_kernel.cc    |   2 +
 paddle/phi/kernels/gpu/all_to_all_kernel.cu   |   2 +
 .../phi/kernels/gpu/transpose_grad_kernel.cu  |   3 +
 paddle/phi/kernels/reshape_kernel.h           |  14 +-
 paddle/phi/kernels/transpose_kernel.h         |  19 ++-
 test/auto_parallel/CMakeLists.txt             |   3 +
 test/auto_parallel/reshard_s_to_s.py          |  78 ++++++++++
 test/auto_parallel/test_reshard_s_to_s.py     |  45 ++++++
 17 files changed, 383 insertions(+), 24 deletions(-)
 create mode 100644 paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
 create mode 100644 paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
 create mode 100644 test/auto_parallel/reshard_s_to_s.py
 create mode 100644 test/auto_parallel/test_reshard_s_to_s.py

diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index 8cf3a4dbbab07..739de2caf1138 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -36,6 +36,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
 #include "paddle/phi/infermeta/spmd_rules/rules.h"
@@ -167,6 +168,10 @@ void BindAutoParallel(py::module *m) {
       *m, "PToRReshardFunction", ReshardFunction)
       .def(py::init<>());
 
+  py::class_<SToSReshardFunction>(
+      *m, "SToSReshardFunction", ReshardFunction)
+      .def(py::init<>());
+
   py::class_<ProcessMesh>(*m, "ProcessMesh")
       .def(py::init<>())
       .def(py::init<const std::vector<int64_t> &,
diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
index 632dca0f0c01a..2d8dff6adb245 100644
--- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -15,4 +15,5 @@ collect_srcs(
   r_to_s_reshard_function.cc
   s_to_r_reshard_function.cc
   r_to_p_reshard_function.cc
-  p_to_r_reshard_function.cc)
+  p_to_r_reshard_function.cc
+  s_to_s_reshard_function.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
index d8681218ae68a..bd2cb4c58a46c 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
@@ -54,20 +54,8 @@ void RToPReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     RESHARD_FUNCTOR(dev_ctx, Full, in.dtype(), shape, 0, GetMutableTensor(out));
   } else {
     // assign the input value to output
-    if (phi::CPUContext::classof(dev_ctx)) {
-      Assign(static_cast<const phi::CPUContext&>(*dev_ctx),
-             in.value(),
-             GetMutableTensor(out));
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    } else if (phi::GPUContext::classof(dev_ctx)) {
-      Assign(static_cast<const phi::GPUContext&>(*dev_ctx),
-             in.value(),
-             GetMutableTensor(out));
-#endif
-    } else {
-      PADDLE_THROW(phi::errors::Unimplemented(
-          "The assign in reshard is only supported on CPU and GPU for now."));
-    }
+    RESHARD_FUNCTOR_WITHOUT_DTYPE(
+        dev_ctx, Assign, in.value(), GetMutableTensor(out));
   }
   SetDistProps(out, in.dims(), out_dist_attr);
 }
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index 631d5d76d35f9..61a3fbbfcad87 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -104,5 +104,30 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
     RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \
   } while (0)
 
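+// RESHARD_FUNCTOR_WITHOUT_DTYPE dispatches `fn_name` on the concrete device
+// context for functors that take no data type argument. Illustrative usage,
+// mirroring the call site in r_to_p_reshard_function.cc above:
+//   RESHARD_FUNCTOR_WITHOUT_DTYPE(
+//       dev_ctx, Assign, in.value(), GetMutableTensor(out));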
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)                 \
+  do {                                                                       \
+    if (phi::CPUContext::classof(dev_ctx)) {                                 \
+      fn_name(static_cast<const phi::CPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else if (phi::GPUContext::classof(dev_ctx)) {                          \
+      fn_name(static_cast<const phi::GPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else {                                                                 \
+      PADDLE_THROW(phi::errors::Unimplemented(                               \
+          "The %s in reshard is only supported on CPU and GPU for now.",     \
+          #fn_name));                                                        \
+    }                                                                        \
+  } while (0)
+#else
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)                 \
+  do {                                                                       \
+    if (phi::CPUContext::classof(dev_ctx)) {                                 \
+      fn_name(static_cast<const phi::CPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else {                                                                 \
+      PADDLE_THROW(phi::errors::Unimplemented(                               \
+          "The %s in reshard is only supported on CPU for now.", #fn_name)); \
+    }                                                                        \
+  } while (0)
+#endif
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
new file mode 100644
index 0000000000000..de60a94f9452a
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
+
+#include "glog/logging.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
+#include "paddle/phi/kernels/all_to_all_kernel.h"
+#include "paddle/phi/kernels/reshape_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+namespace distributed {
+
+bool SToSReshardFunction::IsSuitable(const DistTensor& in,
+                                     const TensorDistAttr& out_dist_attr) {
+  bool flag = true;
+  const auto& in_dist_attr = in.dist_attr();
+
+  flag &= in_dist_attr.is_shard();
+  flag &= out_dist_attr.is_shard();
+
+  const auto& in_process_mesh = in_dist_attr.process_mesh();
+  const auto& out_process_mesh = out_dist_attr.process_mesh();
+
+  flag &= (in_process_mesh.ndim() == 1);
+  flag &= (out_process_mesh.ndim() == 1);
+  flag &= (in_process_mesh == out_process_mesh);
+
+  return flag;
+}
+
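+// Eval moves a tensor that is shard along one axis to shard along another
+// axis of the same 1-D mesh in three steps: (1) locally regroup the data with
+// reshape/transpose so that the block each peer needs is contiguous along
+// axis 0, (2) exchange the blocks with all_to_all, (3) locally restore the
+// layout the output placement expects. Illustrative walk-through (shapes
+// taken from the accompanying test): resharding a (6, 20) tensor on 2 ranks
+// from shard(0) to shard(1), each rank starts with a (3, 20) slice; step 1
+// reshapes it to (3, 2, 10), transposes to (2, 3, 10), and flattens to
+// (6, 10) so the peer's block comes first; step 2 swaps the (3, 10) blocks
+// across ranks, which already yields the final (6, 10) shard(1) slice; step 3
+// is skipped because in_split_axis is 0.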
+void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
+                               const DistTensor& in,
+                               const TensorDistAttr& out_dist_attr,
+                               DistTensor* out) {
+  const auto& in_process_mesh = in.dist_attr().process_mesh();
+  const auto& in_process_ids = in_process_mesh.process_ids();
+  auto dtype = in.dtype();
+  const auto& logical_ddim = in.dims();
+  int64_t nranks = in_process_ids.size();
+  int64_t in_split_axis =
+      GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first;
+  int64_t out_split_axis =
+      GetSplitAxisWithDimsMapping(out_dist_attr.dims_mapping()).begin()->first;
+
+  DenseTensor in_all_to_all = in.value();
+  // 1. preprocess: reshape and transpose the input tensor
+  if (out_split_axis != 0) {
+    // 1.1 calculate the intermediate shape and reshape
+    std::vector<int64_t> pre_shape_vec = vectorize(logical_ddim);
+    pre_shape_vec[in_split_axis] /= nranks;
+    pre_shape_vec[out_split_axis] /= nranks;
+    pre_shape_vec.insert(pre_shape_vec.begin() + out_split_axis, nranks);
+
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, in.value(), pre_shape_vec, &out_reshape1);
+
+    // 1.2 calculate the desired axes and transpose
+    std::vector<int> axis;
+    axis.emplace_back(out_split_axis);
+    for (size_t i = 0; i < pre_shape_vec.size(); ++i) {
+      if (static_cast<int64_t>(i) != out_split_axis) {
+        axis.emplace_back(i);
+      }
+    }
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 1.3 calculate the final shape and reshape
+    pre_shape_vec.erase(pre_shape_vec.begin() + out_split_axis);
+    pre_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out_transpose, pre_shape_vec, &in_all_to_all);
+  }
+
+  // 2. use all_to_all to exchange the data with the other ranks
+  DenseTensor out_all_to_all;
+  RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                            AllToAll,
+                            dtype,
+                            in_process_ids,
+                            in_all_to_all,
+                            GetMutableTensor(out));
+
+  // 3. postprocess: reshape and transpose the output tensor
+  if (in_split_axis != 0) {
+    // 3.1 calculate the intermediate shape and reshape
+    std::vector<int64_t> post_shape_vec = vectorize(logical_ddim);
+    post_shape_vec[in_split_axis] /= nranks;
+    post_shape_vec[out_split_axis] /= nranks;
+    post_shape_vec.insert(post_shape_vec.begin(), nranks);
+
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out->value(), post_shape_vec, &out_reshape1);
+
+    // 3.2 calculate the desired axes and transpose
+    std::vector<int> axis;
+    for (size_t i = 1; i < post_shape_vec.size(); ++i) {
+      axis.emplace_back(i);
+    }
+    axis.insert(axis.begin() + in_split_axis, 0);
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 3.3 calculate the final shape and reshape
+    post_shape_vec.erase(post_shape_vec.begin());
+    post_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(dev_ctx,
+                    Reshape,
+                    dtype,
+                    out_transpose,
+                    post_shape_vec,
+                    GetMutableTensor(out));
+  }
+
+  SetDistProps(out, in.dims(), out_dist_attr);
+}
+
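+// Register the function so that the reshard machinery can discover it and
+// select it (via IsSuitable) when both input and output placements are shard.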
+REGISTER_RESHARD_FUNC(SToSReshardFunction);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
new file mode 100644
index 0000000000000..b004ed919c192
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
+
+namespace phi {
+namespace distributed {
+
+class SToSReshardFunction final : public ReshardFunction {
+ public:
+  SToSReshardFunction() = default;
+  ~SToSReshardFunction() = default;
+
+  bool IsSuitable(const DistTensor& in,
+                  const TensorDistAttr& out_dist_attr) override;
+
+  void Eval(DeviceContext* dev_ctx,
+            const DistTensor& in,
+            const TensorDistAttr& out_dist_attr,
+            DistTensor* out) override;
+};
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/kernels/all_to_all_kernel.h b/paddle/phi/kernels/all_to_all_kernel.h
index 3d8594b469c78..5444960b1f69e 100644
--- a/paddle/phi/kernels/all_to_all_kernel.h
+++ b/paddle/phi/kernels/all_to_all_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
 
@@ -23,4 +24,13 @@ void AllToAllKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     DenseTensor* out);
 
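+// Convenience wrapper that runs AllToAllInferMeta before launching the
+// kernel, for in-library callers such as the reshard functions. An
+// illustrative call site (dtype assumed): AllToAll<float>(dev_ctx, x, &out);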
+template <typename T, typename Context>
+void AllToAll(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  MetaTensor out_meta(*out);
+  MetaTensor* out_meta_ptr = &out_meta;
+
+  AllToAllInferMeta(phi::MetaTensor(x), out_meta_ptr);
+  AllToAllKernel<T, Context>(dev_ctx, x, out);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/all_to_all_kernel.cc b/paddle/phi/kernels/cpu/all_to_all_kernel.cc
index 5ea535c6e5e90..3407a1828e208 100644
--- a/paddle/phi/kernels/cpu/all_to_all_kernel.cc
+++ b/paddle/phi/kernels/cpu/all_to_all_kernel.cc
@@ -78,6 +78,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -90,6 +91,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    bool,
                    int8_t,
+                   int16_t,
                    uint8_t,
                    int64_t,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
index 590d227d1e518..cc3340edcb4ab 100644
--- a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
@@ -28,6 +28,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc
index 73359479770fd..bab9d47caa9aa 100644
--- a/paddle/phi/kernels/cpu/transpose_kernel.cc
+++ b/paddle/phi/kernels/cpu/transpose_kernel.cc
@@ -87,7 +87,9 @@ PD_REGISTER_KERNEL(transpose,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
                    int8_t,
+                   int16_t,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
index 3e1008c408179..6d50e2ceb1ae5 100644
--- a/paddle/phi/kernels/gpu/all_to_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
@@ -94,6 +94,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::bfloat16,
@@ -108,6 +109,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
index 35215ed5b96ed..c90d0bc40875b 100644
--- a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
@@ -26,6 +26,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    bool,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    int32_t,
                    int64_t,
                    phi::dtype::float16,
diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h
index da8679cc39980..972d72ad706d9 100644
--- a/paddle/phi/kernels/reshape_kernel.h
+++ b/paddle/phi/kernels/reshape_kernel.h
@@ -41,14 +41,22 @@ void ReshapeStridedKernel(const Context& dev_ctx,
                           DenseTensor* out,
                           DenseTensor* xshape);
 
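+// Overload that writes into a caller-provided output tensor so that reshard
+// code can fill a preallocated DenseTensor; the value-returning overload
+// below forwards to it.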
+template <typename Context>
+void Reshape(const Context& dev_ctx,
+             const DenseTensor& x,
+             const std::vector<int64_t>& shape,
+             DenseTensor* out) {
+  MetaTensor meta_out(out);
+  InferMetaFromVecValue(x, shape, &meta_out);
+  ReshapeInferKernel<Context>(dev_ctx, x, IntArray(shape), out);
+}
+
 template <typename Context>
 DenseTensor Reshape(const Context& dev_ctx,
                     const DenseTensor& x,
                     const std::vector<int64_t>& shape) {
   DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  InferMetaFromVecValue(x, shape, &meta_out);
-  ReshapeInferKernel<Context>(dev_ctx, x, IntArray(shape), &dense_out);
+  Reshape<Context>(dev_ctx, x, shape, &dense_out);
   return dense_out;
 }
diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h
index 2c576a85710cf..5555586c04387 100644
--- a/paddle/phi/kernels/transpose_kernel.h
+++ b/paddle/phi/kernels/transpose_kernel.h
@@ -34,17 +34,24 @@ void TransposeStridedKernel(const Context& dev_ctx,
                             DenseTensor* out);
 
 template <typename T, typename Context>
-DenseTensor Transpose(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const std::vector<int>& axis) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
+void Transpose(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int>& axis,
+               DenseTensor* dense_out) {
+  MetaTensor meta_out(dense_out);
   TransposeInferMeta(x, axis, &meta_out);
   // do not call TransposeStridedKernel, because some other kernels call
   // Transpose directly
-  TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  TransposeKernel<T, Context>(dev_ctx, x, axis, dense_out);
+}
+
+template <typename T, typename Context>
+DenseTensor Transpose(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const std::vector<int>& axis) {
+  DenseTensor dense_out;
+  Transpose<T, Context>(dev_ctx, x, axis, &dense_out);
   return dense_out;
 }
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 16ac0758ae96a..1f5f9177fc63f 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -82,6 +82,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_reshard_s_to_r MODULES test_reshard_s_to_r)
   set_tests_properties(test_reshard_s_to_r
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+  py_test_modules(test_reshard_s_to_s MODULES test_reshard_s_to_s)
+  set_tests_properties(test_reshard_s_to_s
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
   py_test_modules(test_reshard_r_to_s MODULES test_reshard_r_to_s)
   set_tests_properties(test_reshard_r_to_s
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
diff --git a/test/auto_parallel/reshard_s_to_s.py b/test/auto_parallel/reshard_s_to_s.py
new file mode 100644
index 0000000000000..dfdea856ab8fe
--- /dev/null
+++ b/test/auto_parallel/reshard_s_to_s.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+from paddle.framework import core
+
+
+class TestReshardSToS:
+    def __init__(self):
+        self._shape = eval(os.getenv("shape"))
+        self._dtype = os.getenv("dtype")
+        self._seeds = eval(os.getenv("seeds"))
+        self._backend = os.getenv("backend")
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+    def test_body(self, in_shard, out_shard):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+            place = paddle.CPUPlace()
+        elif self._backend == "gpu":
+            place = paddle.CUDAPlace(dist.get_rank())
+
+        dev_ctx = core.DeviceContext.create(place)
+        a = paddle.ones(self._shape)
+
+        in_shard_specs = [None for i in range(len(self._shape))]
+        in_shard_specs[in_shard] = "x"
+        out_shard_specs = [None for i in range(len(self._shape))]
+        out_shard_specs[out_shard] = "x"
+
+        dist_attr = dist.DistAttr(
+            mesh=self._mesh, sharding_specs=in_shard_specs
+        )
+        out_dist_attr = dist.DistAttr(
+            mesh=self._mesh, sharding_specs=out_shard_specs
+        )
+
+        input_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+
+        reshard_func = core.SToSReshardFunction()
+        assert reshard_func.is_suitable(input_tensor, out_dist_attr)
+
+        out = reshard_func.eval(dev_ctx, input_tensor, out_dist_attr)
+        out_shape = list(self._shape)
+
+        out_shape[out_shard] = out_shape[out_shard] // 2
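+
+        # With a 1-D mesh over 2 ranks, resharding halves the local extent on
+        # the new shard axis, e.g. a (6, 20) input going from shard(0) to
+        # shard(1) leaves each rank with a (6, 10) slice.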
+        assert np.equal(out.shape, input_tensor.shape).all()
+        assert np.equal(out._local_shape, out_shape).all()
+
+    def test_case1(self):
+        self.test_body(0, len(self._shape) - 1)
+
+    def test_case2(self):
+        self.test_body(len(self._shape) - 1, 0)
+
+    def run_test_case(self):
+        self.test_case1()
+        self.test_case2()
+
+
+if __name__ == '__main__':
+    TestReshardSToS().run_test_case()
diff --git a/test/auto_parallel/test_reshard_s_to_s.py b/test/auto_parallel/test_reshard_s_to_s.py
new file mode 100644
index 0000000000000..ab3f470a6568e
--- /dev/null
+++ b/test/auto_parallel/test_reshard_s_to_s.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import collective.test_communication_api_base as test_base
+
+
+class TestReshardSToS(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=2, timeout=120)
+        self._default_envs = {
+            "shape": "(6, 20)",
+            "dtype": "float32",
+            "seeds": str(self._seeds),
+            "backend": "gpu",
+        }
+        self._changeable_envs = {
+            "shape": ["(6, 20)", "(6, 20, 10)"],
+        }
+
+    def test_reshard_s_to_s(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            self.run_test_case(
+                "reshard_s_to_s.py",
+                user_defined_envs=envs,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 544b0397446b781e538fb64eed73a8cb1298fde6 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Thu, 14 Sep 2023 11:07:54 +0800
Subject: [PATCH 2/3] refine, remove useless code

---
 .../auto_parallel/r_to_s_reshard_function.cc        | 6 +++---
 .../core/distributed/auto_parallel/reshard_utils.cc | 4 ++--
 .../core/distributed/auto_parallel/reshard_utils.h  | 2 +-
 .../auto_parallel/s_to_r_reshard_function.cc        | 8 ++------
 .../auto_parallel/s_to_s_reshard_function.cc        | 7 +++----
 5 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
index a3c616f5e8b35..381f77991f72d 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
@@ -51,11 +51,11 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
 
   DenseTensor out_physical_tensor_cur_rank;
 
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
+  std::map<int, int64_t> split_axis_to_mesh_axis =
       GetSplitAxisWithDimsMapping(out_dims_mapping);
   std::vector<int64_t> coord_in_mesh = GetCurRankCoordInMesh(out_process_mesh);
 
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = split_axis_to_mesh_axis.begin()->first;
   int64_t mesh_axis = split_axis_to_mesh_axis.begin()->second;
 
   int64_t num_of_process = out_process_mesh.shape()[mesh_axis];
@@ -65,7 +65,7 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
          << " process participate in.";
 
   std::vector<int64_t> split_num_vec =
-      BalancedSplit(in.dims()[static_cast<int>(split_axis)], num_of_process);
+      BalancedSplit(in.dims()[split_axis], num_of_process);
   IntArray sections(split_num_vec);
 
   std::vector<DenseTensor> split_out_vec;
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
index 1d9677a0a2bc5..2767dfa836394 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
@@ -112,9 +112,9 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   return comm_context;
 }
 
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping) {
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis;
+  std::map<int, int64_t> split_axis_to_mesh_axis;
   for (size_t i = 0; i < dims_mapping.size(); ++i) {
     if (dims_mapping[i] != -1) {
       split_axis_to_mesh_axis.emplace(i, dims_mapping[i]);
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index 61a3fbbfcad87..b947c70bb5bc9 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -40,7 +40,7 @@ std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh);
 // input vector, return a key-value map of tensor_split_axis and
 // process_mesh_split_axis.
 // For example, if dims_mapping is [-1, 1, -1, 0], will return {1: 1, 3: 0}.
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping);
 
 // If given a number, balance split it to multiple pieces.
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
index 20adea5ea90bb..efa5035c495ed 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
@@ -44,9 +44,7 @@ bool SToRReshardFunction::IsSuitable(const DistTensor& in,
 
   // Ensure the tensor is balanced split, or we need send/recv rather than
   // all_gather
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
   int64_t num_of_process = in_process_mesh.size();
   flag &= (in.local_dims()[static_cast<int>(split_axis)] * num_of_process ==
            in.dims()[static_cast<int>(split_axis)]);
@@ -74,9 +72,7 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx,
                             in.value(),
                             in_process_ids.size(),
                             GetMutableTensor(out));
 
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
 
   if (split_axis == 0) {
     // If the input dist tensor is shard(0), the subsequent split
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
index de60a94f9452a..45ec290973446 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
-#include "glog/logging.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -52,9 +51,9 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
   auto dtype = in.dtype();
   const auto& logical_ddim = in.dims();
   int64_t nranks = in_process_ids.size();
-  int64_t in_split_axis =
+  int in_split_axis =
       GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first;
-  int64_t out_split_axis =
+  int out_split_axis =
       GetSplitAxisWithDimsMapping(out_dist_attr.dims_mapping()).begin()->first;
 
   DenseTensor in_all_to_all = in.value();
@@ -74,7 +73,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     std::vector<int> axis;
     axis.emplace_back(out_split_axis);
     for (size_t i = 0; i < pre_shape_vec.size(); ++i) {
-      if (static_cast<int64_t>(i) != out_split_axis) {
+      if (static_cast<int>(i) != out_split_axis) {
         axis.emplace_back(i);
       }
     }

From 75df74b90e5be22a6c9eead2cf46a39f959f3725 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Thu, 14 Sep 2023 14:26:21 +0800
Subject: [PATCH 3/3] reduce changed files

---
 .../auto_parallel/r_to_p_reshard_function.cc | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
index bd2cb4c58a46c..d8681218ae68a 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
@@ -54,8 +54,20 @@ void RToPReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     RESHARD_FUNCTOR(dev_ctx, Full, in.dtype(), shape, 0, GetMutableTensor(out));
   } else {
     // assign the input value to output
-    RESHARD_FUNCTOR_WITHOUT_DTYPE(
-        dev_ctx, Assign, in.value(), GetMutableTensor(out));
+    if (phi::CPUContext::classof(dev_ctx)) {
+      Assign(static_cast<const phi::CPUContext&>(*dev_ctx),
+             in.value(),
+             GetMutableTensor(out));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    } else if (phi::GPUContext::classof(dev_ctx)) {
+      Assign(static_cast<const phi::GPUContext&>(*dev_ctx),
+             in.value(),
+             GetMutableTensor(out));
+#endif
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "The assign in reshard is only supported on CPU and GPU for now."));
+    }
   }
   SetDistProps(out, in.dims(), out_dist_attr);
 }