PaddlePaddle · LiYuRio · Sep 15, 2023 · Sep 6, 2023 · Sep 14, 2023 · Sep 14, 2023
diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -36,6 +36,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
 #include "paddle/phi/infermeta/spmd_rules/rules.h"
@@ -167,6 +168,10 @@ void BindAutoParallel(py::module *m) {
       *m, "PToRReshardFunction", ReshardFunction)
       .def(py::init<>());
 
+  py::class_<phi::distributed::SToSReshardFunction>(
+      *m, "SToSReshardFunction", ReshardFunction)
+      .def(py::init<>());
+
   py::class_<ProcessMesh>(*m, "ProcessMesh")
       .def(py::init<>())
       .def(py::init<const std::vector<int64_t> &,

diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -15,4 +15,5 @@ collect_srcs(
   r_to_s_reshard_function.cc
   s_to_r_reshard_function.cc
   r_to_p_reshard_function.cc
-  p_to_r_reshard_function.cc)
+  p_to_r_reshard_function.cc
+  s_to_s_reshard_function.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
@@ -51,11 +51,11 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
 
   DenseTensor out_physical_tensor_cur_rank;
 
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
+  std::map<int, int64_t> split_axis_to_mesh_axis =
       GetSplitAxisWithDimsMapping(out_dims_mapping);
   std::vector<int64_t> coord_in_mesh = GetCurRankCoordInMesh(out_process_mesh);
 
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = split_axis_to_mesh_axis.begin()->first;
   int64_t mesh_axis = split_axis_to_mesh_axis.begin()->second;
 
   int64_t num_of_process = out_process_mesh.shape()[mesh_axis];
@@ -65,7 +65,7 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
           << " process participate in.";
 
   std::vector<int64_t> split_num_vec =
-      BalancedSplit(in.dims()[static_cast<int>(split_axis)], num_of_process);
+      BalancedSplit(in.dims()[split_axis], num_of_process);
   IntArray sections(split_num_vec);
 
   std::vector<DenseTensor> split_out_vec;

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
@@ -112,9 +112,9 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   return comm_context;
 }
 
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping) {
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis;
+  std::map<int, int64_t> split_axis_to_mesh_axis;
   for (size_t i = 0; i < dims_mapping.size(); ++i) {
     if (dims_mapping[i] != -1) {
       split_axis_to_mesh_axis.emplace(i, dims_mapping[i]);

diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -40,7 +40,7 @@ std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh);
 // input vector, return a key-value map of tensor_split_axis and
 // process_mesh_split_axis.
 // For example, if dims_mapping is [-1, 1, -1, 0], will return {1: 1, 3: 0}.
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping);
 
 // If given a number, balance split it to multiple pieces.
@@ -104,5 +104,30 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
     RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \
   } while (0)
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)          \
+  do {                                                                \
+    if (phi::CPUContext::classof(dev_ctx)) {                          \
+      fn_name(static_cast<const CPUContext&>(*dev_ctx), __VA_ARGS__); \
+    } else if (phi::GPUContext::classof(dev_ctx)) {                   \
+      fn_name(static_cast<const GPUContext&>(*dev_ctx), __VA_ARGS__); \
+    } else {                                                          \
+      PADDLE_THROW(phi::errors::Unimplemented(                        \
+          "The %s in reshard only supported on CPU and GPU for now.", \
+          #fn_name));                                                 \
+    }                                                                 \
+  } while (0)
+#else  // neither CUDA nor HIP: only the CPUContext branch is generated
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)              \
+  do {                                                                    \
+    if (phi::CPUContext::classof(dev_ctx)) {                              \
+      fn_name(static_cast<const CPUContext&>(*dev_ctx), __VA_ARGS__);     \
+    } else {                                                              \
+      PADDLE_THROW(phi::errors::Unimplemented(                            \
+          "The %s in reshard only supported on CPU for now.", #fn_name)); \
+    }                                                                     \
+  } while (0)
+#endif  // PADDLE_WITH_CUDA || PADDLE_WITH_HIP
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
@@ -44,9 +44,7 @@ bool SToRReshardFunction::IsSuitable(const DistTensor& in,
 
   // Ensure the tensor is balanced split, or we need send/recv rather than
   // all_gather
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
   int64_t num_of_process = in_process_mesh.size();
   flag &= (in.local_dims()[static_cast<int>(split_axis)] * num_of_process ==
            in.dims()[static_cast<int>(split_axis)]);
@@ -74,9 +72,7 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx,
                             in.value(),
                             in_process_ids.size(),
                             GetMutableTensor(out));
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
 
   if (split_axis == 0) {
     // If the input dist tensor is shard(0), the subsequent split

diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -0,0 +1,139 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
+
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
+#include "paddle/phi/kernels/all_to_all_kernel.h"
+#include "paddle/phi/kernels/reshape_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+namespace distributed {
+
+bool SToSReshardFunction::IsSuitable(const DistTensor& in,
+                                     const TensorDistAttr& out_dist_attr) {
+  // Both the source and the target distribution have to be sharded.
+  const auto& src_attr = in.dist_attr();
+  if (!src_attr.is_shard() || !out_dist_attr.is_shard()) {
+    return false;
+  }
+
+  // Only one-dimensional process meshes are accepted ...
+  const auto& src_mesh = src_attr.process_mesh();
+  const auto& dst_mesh = out_dist_attr.process_mesh();
+  if (src_mesh.ndim() != 1 || dst_mesh.ndim() != 1) {
+    return false;
+  }
+  // ... and the source and target mesh must compare equal.
+  return src_mesh == dst_mesh;
+}
+
+// Reshards `in` from its current split axis to the split axis described by
+// `out_dist_attr` using one all-to-all over the ranks of the input mesh.
+// Local reshape/transpose steps run before and after it when needed.
+void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
+                               const DistTensor& in,
+                               const TensorDistAttr& out_dist_attr,
+                               DistTensor* out) {
+  const auto& in_process_mesh = in.dist_attr().process_mesh();
+  const auto& in_process_ids = in_process_mesh.process_ids();
+  auto dtype = in.dtype();
+  const auto& logical_ddim = in.dims();
+  int64_t nranks = static_cast<int64_t>(in_process_ids.size());
+  int in_split_axis =
+      GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first;
+  int out_split_axis =
+      GetSplitAxisWithDimsMapping(out_dist_attr.dims_mapping()).begin()->first;
+
+  DenseTensor in_all_to_all = in.value();
+  // 1. preprocess: move the nranks blocks of out_split_axis to the front
+  if (out_split_axis != 0) {
+    // 1.1 block shape; NOTE(review): assumes dims divide evenly by nranks
+    std::vector<int64_t> pre_shape_vec = vectorize(logical_ddim);
+    pre_shape_vec[in_split_axis] /= nranks;
+    pre_shape_vec[out_split_axis] /= nranks;
+    pre_shape_vec.insert(pre_shape_vec.begin() + out_split_axis, nranks);
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, in.value(), pre_shape_vec, &out_reshape1);
+
+    // 1.2 calc the desired axis order and transpose
+    std::vector<int> axis;
+    axis.emplace_back(out_split_axis);
+    for (size_t i = 0; i < pre_shape_vec.size(); ++i) {
+      if (static_cast<int>(i) != out_split_axis) {
+        axis.emplace_back(static_cast<int>(i));
+      }
+    }
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 1.3 calc the final shape and reshape
+    pre_shape_vec.erase(pre_shape_vec.begin() + out_split_axis);
+    pre_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out_transpose, pre_shape_vec, &in_all_to_all);
+  }
+
+  // 2. all-to-all across the mesh ranks, received directly into `out`
+  RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                            AllToAll,
+                            dtype,
+                            in_process_ids,
+                            in_all_to_all,
+                            GetMutableTensor(out));
+
+  // 3. postprocess: fold the leading nranks dim back into in_split_axis
+  if (in_split_axis != 0) {
+    // 3.1 calc the shape and reshape
+    std::vector<int64_t> post_shape_vec = vectorize(logical_ddim);
+    post_shape_vec[in_split_axis] /= nranks;
+    post_shape_vec[out_split_axis] /= nranks;
+    post_shape_vec.insert(post_shape_vec.begin(), nranks);
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out->value(), post_shape_vec, &out_reshape1);
+
+    // 3.2 calc the desired axis order and transpose
+    std::vector<int> axis;
+    for (size_t i = 1; i < post_shape_vec.size(); ++i) {
+      axis.emplace_back(static_cast<int>(i));
+    }
+    axis.insert(axis.begin() + in_split_axis, 0);
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 3.3 calc the final shape and reshape
+    post_shape_vec.erase(post_shape_vec.begin());
+    post_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(dev_ctx,
+                    Reshape,
+                    dtype,
+                    out_transpose,
+                    post_shape_vec,
+                    GetMutableTensor(out));
+  }
+
+  SetDistProps(out, in.dims(), out_dist_attr);
+}
+
+REGISTER_RESHARD_FUNC(SToSReshardFunction);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
+
+namespace phi {
+namespace distributed {
+
+// Reshard function for shard -> shard conversion on a shared 1-D mesh.
+class SToSReshardFunction final : public ReshardFunction {
+ public:
+  SToSReshardFunction() = default;
+  ~SToSReshardFunction() = default;
+  bool IsSuitable(const DistTensor& in,
+                  const TensorDistAttr& out_dist_attr) override;
+
+  void Eval(DeviceContext* dev_ctx,
+            const DistTensor& in,
+            const TensorDistAttr& out_dist_attr,
+            DistTensor* out) override;
+};
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/kernels/all_to_all_kernel.h b/paddle/phi/kernels/all_to_all_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
 
@@ -23,4 +24,13 @@ void AllToAllKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     DenseTensor* out);
 
+// Runs AllToAllInferMeta for `out`, then AllToAllKernel on `x` and `out`.
+template <typename T, typename Context>
+void AllToAll(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  // Meta view over `out` that the infer-meta call writes through.
+  MetaTensor inferred_meta(*out);
+  AllToAllInferMeta(phi::MetaTensor(x), &inferred_meta);
+  AllToAllKernel<T, Context>(dev_ctx, x, out);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/all_to_all_kernel.cc b/paddle/phi/kernels/cpu/all_to_all_kernel.cc
@@ -78,6 +78,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -90,6 +91,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    bool,
                    int8_t,
+                   int16_t,
                    uint8_t,
                    int64_t,
                    phi::dtype::float16) {}

diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
@@ -28,6 +28,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}

diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc
@@ -87,7 +87,9 @@ PD_REGISTER_KERNEL(transpose,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
                    int8_t,
+                   int16_t,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,

diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
@@ -94,6 +94,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::bfloat16,
@@ -108,6 +109,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::float16) {}

diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
@@ -26,6 +26,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    bool,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    int32_t,
                    int64_t,
                    phi::dtype::float16,