From 2a2b727bbddbfaa90aa5673f09059d0ecd2b8503 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Wed, 6 Sep 2023 20:18:43 +0800
Subject: [PATCH 1/3] support reshard s to s

---
 paddle/fluid/pybind/auto_parallel_py.cc       |   5 +
 .../distributed/auto_parallel/CMakeLists.txt  |   3 +-
 .../auto_parallel/r_to_p_reshard_function.cc  |  16 +-
 .../distributed/auto_parallel/reshard_utils.h |  25 ++++
 .../auto_parallel/s_to_s_reshard_function.cc  | 140 ++++++++++++++++++
 .../auto_parallel/s_to_s_reshard_function.h   |  37 +++++
 paddle/phi/kernels/all_to_all_kernel.h        |  10 ++
 paddle/phi/kernels/cpu/all_to_all_kernel.cc   |   2 +
 .../phi/kernels/cpu/transpose_grad_kernel.cc  |   3 +
 paddle/phi/kernels/cpu/transpose_kernel.cc    |   2 +
 paddle/phi/kernels/gpu/all_to_all_kernel.cu   |   2 +
 .../phi/kernels/gpu/transpose_grad_kernel.cu  |   3 +
 paddle/phi/kernels/reshape_kernel.h           |  14 +-
 paddle/phi/kernels/transpose_kernel.h         |  19 ++-
 test/auto_parallel/CMakeLists.txt             |   3 +
 test/auto_parallel/reshard_s_to_s.py          |  78 ++++++++++
 test/auto_parallel/test_reshard_s_to_s.py     |  45 ++++++
 17 files changed, 383 insertions(+), 24 deletions(-)
 create mode 100644 paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
 create mode 100644 paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
 create mode 100644 test/auto_parallel/reshard_s_to_s.py
 create mode 100644 test/auto_parallel/test_reshard_s_to_s.py

diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc
index 8cf3a4dbbab07..739de2caf1138 100644
--- a/paddle/fluid/pybind/auto_parallel_py.cc
+++ b/paddle/fluid/pybind/auto_parallel_py.cc
@@ -36,6 +36,7 @@
 #include "paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.h"
 #include "paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.h"
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
 #ifdef PADDLE_WITH_DISTRIBUTE
 #include "paddle/phi/infermeta/spmd_rules/rules.h"
@@ -167,6 +168,10 @@ void BindAutoParallel(py::module *m) {
       *m, "PToRReshardFunction", ReshardFunction)
       .def(py::init<>());
 
+  py::class_<SToSReshardFunction>(
+      *m, "SToSReshardFunction", ReshardFunction)
+      .def(py::init<>());
+
   py::class_<ProcessMesh>(*m, "ProcessMesh")
       .def(py::init<>())
       .def(py::init<const std::vector<int64_t> &,
diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
index 632dca0f0c01a..2d8dff6adb245 100644
--- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
+++ b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt
@@ -15,4 +15,5 @@ collect_srcs(
   r_to_s_reshard_function.cc
   s_to_r_reshard_function.cc
   r_to_p_reshard_function.cc
-  p_to_r_reshard_function.cc)
+  p_to_r_reshard_function.cc
+  s_to_s_reshard_function.cc)
diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
index d8681218ae68a..bd2cb4c58a46c 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
@@ -54,20 +54,8 @@ void RToPReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     RESHARD_FUNCTOR(dev_ctx, Full, in.dtype(), shape, 0, GetMutableTensor(out));
   } else {
     // assign the input value to output
-    if (phi::CPUContext::classof(dev_ctx)) {
-      Assign(static_cast<const phi::CPUContext&>(*dev_ctx),
-             in.value(),
-             GetMutableTensor(out));
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    } else if (phi::GPUContext::classof(dev_ctx)) {
-      Assign(static_cast<const phi::GPUContext&>(*dev_ctx),
-             in.value(),
-             GetMutableTensor(out));
-#endif
-    } else {
-      PADDLE_THROW(phi::errors::Unimplemented(
-          "The assign in reshard is only supported on CPU and GPU for now."));
-    }
+    RESHARD_FUNCTOR_WITHOUT_DTYPE(
+        dev_ctx, Assign, in.value(), GetMutableTensor(out));
   }
   SetDistProps(out, in.dims(), out_dist_attr);
 }
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index 631d5d76d35f9..61a3fbbfcad87 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -104,5 +104,30 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
     RESHARD_FUNCTOR_IMPL(dev_ctx, fn_name, dtype, __VA_ARGS__); \
   } while (0)
 
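+// RESHARD_FUNCTOR_WITHOUT_DTYPE dispatches `fn_name` on the concrete device
+// context for functors that take no data type argument. Illustrative usage,
+// mirroring the call site in r_to_p_reshard_function.cc above:
+//   RESHARD_FUNCTOR_WITHOUT_DTYPE(
+//       dev_ctx, Assign, in.value(), GetMutableTensor(out));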
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)                 \
+  do {                                                                       \
+    if (phi::CPUContext::classof(dev_ctx)) {                                 \
+      fn_name(static_cast<const phi::CPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else if (phi::GPUContext::classof(dev_ctx)) {                          \
+      fn_name(static_cast<const phi::GPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else {                                                                 \
+      PADDLE_THROW(phi::errors::Unimplemented(                               \
+          "The %s in reshard is only supported on CPU and GPU for now.",     \
+          #fn_name));                                                        \
+    }                                                                        \
+  } while (0)
+#else
+#define RESHARD_FUNCTOR_WITHOUT_DTYPE(dev_ctx, fn_name, ...)                 \
+  do {                                                                       \
+    if (phi::CPUContext::classof(dev_ctx)) {                                 \
+      fn_name(static_cast<const phi::CPUContext&>(*dev_ctx), __VA_ARGS__);   \
+    } else {                                                                 \
+      PADDLE_THROW(phi::errors::Unimplemented(                               \
+          "The %s in reshard is only supported on CPU for now.", #fn_name)); \
+    }                                                                        \
+  } while (0)
+#endif
+
 }  // namespace distributed
 }  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
new file mode 100644
index 0000000000000..de60a94f9452a
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -0,0 +1,140 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
+
+#include "glog/logging.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
+#include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
+#include "paddle/phi/kernels/all_to_all_kernel.h"
+#include "paddle/phi/kernels/reshape_kernel.h"
+#include "paddle/phi/kernels/transpose_kernel.h"
+
+namespace phi {
+namespace distributed {
+
+bool SToSReshardFunction::IsSuitable(const DistTensor& in,
+                                     const TensorDistAttr& out_dist_attr) {
+  bool flag = true;
+  const auto& in_dist_attr = in.dist_attr();
+
+  flag &= in_dist_attr.is_shard();
+  flag &= out_dist_attr.is_shard();
+
+  const auto& in_process_mesh = in_dist_attr.process_mesh();
+  const auto& out_process_mesh = out_dist_attr.process_mesh();
+
+  flag &= (in_process_mesh.ndim() == 1);
+  flag &= (out_process_mesh.ndim() == 1);
+  flag &= (in_process_mesh == out_process_mesh);
+
+  return flag;
+}
+
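+// Eval moves a tensor that is shard along one axis to shard along another
+// axis of the same 1-D mesh in three steps: (1) locally regroup the data with
+// reshape/transpose so that the block each peer needs is contiguous along
+// axis 0, (2) exchange the blocks with all_to_all, (3) locally restore the
+// layout the output placement expects. Illustrative walk-through (shapes
+// taken from the accompanying test): resharding a (6, 20) tensor on 2 ranks
+// from shard(0) to shard(1), each rank starts with a (3, 20) slice; step 1
+// reshapes it to (3, 2, 10), transposes to (2, 3, 10), and flattens to
+// (6, 10) so the peer's block comes first; step 2 swaps the (3, 10) blocks
+// across ranks, which already yields the final (6, 10) shard(1) slice; step 3
+// is skipped because in_split_axis is 0.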
+void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
+                               const DistTensor& in,
+                               const TensorDistAttr& out_dist_attr,
+                               DistTensor* out) {
+  const auto& in_process_mesh = in.dist_attr().process_mesh();
+  const auto& in_process_ids = in_process_mesh.process_ids();
+  auto dtype = in.dtype();
+  const auto& logical_ddim = in.dims();
+  int64_t nranks = in_process_ids.size();
+  int64_t in_split_axis =
+      GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first;
+  int64_t out_split_axis =
+      GetSplitAxisWithDimsMapping(out_dist_attr.dims_mapping()).begin()->first;
+
+  DenseTensor in_all_to_all = in.value();
+  // 1. preprocess: reshape and transpose the input tensor
+  if (out_split_axis != 0) {
+    // 1.1 calculate the intermediate shape and reshape
+    std::vector<int64_t> pre_shape_vec = vectorize(logical_ddim);
+    pre_shape_vec[in_split_axis] /= nranks;
+    pre_shape_vec[out_split_axis] /= nranks;
+    pre_shape_vec.insert(pre_shape_vec.begin() + out_split_axis, nranks);
+
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, in.value(), pre_shape_vec, &out_reshape1);
+
+    // 1.2 calculate the desired axes and transpose
+    std::vector<int> axis;
+    axis.emplace_back(out_split_axis);
+    for (size_t i = 0; i < pre_shape_vec.size(); ++i) {
+      if (static_cast<int64_t>(i) != out_split_axis) {
+        axis.emplace_back(i);
+      }
+    }
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 1.3 calculate the final shape and reshape
+    pre_shape_vec.erase(pre_shape_vec.begin() + out_split_axis);
+    pre_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out_transpose, pre_shape_vec, &in_all_to_all);
+  }
+
+  // 2. use all_to_all to exchange the data with the other ranks
+  DenseTensor out_all_to_all;
+  RESHARD_FUNCTOR_WITH_COMM(dev_ctx,
+                            AllToAll,
+                            dtype,
+                            in_process_ids,
+                            in_all_to_all,
+                            GetMutableTensor(out));
+
+  // 3. postprocess: reshape and transpose the output tensor
+  if (in_split_axis != 0) {
+    // 3.1 calculate the intermediate shape and reshape
+    std::vector<int64_t> post_shape_vec = vectorize(logical_ddim);
+    post_shape_vec[in_split_axis] /= nranks;
+    post_shape_vec[out_split_axis] /= nranks;
+    post_shape_vec.insert(post_shape_vec.begin(), nranks);
+
+    DenseTensor out_reshape1;
+    RESHARD_FUNCTOR(
+        dev_ctx, Reshape, dtype, out->value(), post_shape_vec, &out_reshape1);
+
+    // 3.2 calculate the desired axes and transpose
+    std::vector<int> axis;
+    for (size_t i = 1; i < post_shape_vec.size(); ++i) {
+      axis.emplace_back(i);
+    }
+    axis.insert(axis.begin() + in_split_axis, 0);
+    DenseTensor out_transpose;
+    RESHARD_FUNCTOR(
+        dev_ctx, Transpose, dtype, out_reshape1, axis, &out_transpose);
+
+    // 3.3 calculate the final shape and reshape
+    post_shape_vec.erase(post_shape_vec.begin());
+    post_shape_vec[in_split_axis] *= nranks;
+    RESHARD_FUNCTOR(dev_ctx,
+                    Reshape,
+                    dtype,
+                    out_transpose,
+                    post_shape_vec,
+                    GetMutableTensor(out));
+  }
+
+  SetDistProps(out, in.dims(), out_dist_attr);
+}
+
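+// Register the function so that the reshard machinery can discover it and
+// select it (via IsSuitable) when both input and output placements are shard.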
+REGISTER_RESHARD_FUNC(SToSReshardFunction);
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
new file mode 100644
index 0000000000000..b004ed919c192
--- /dev/null
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/distributed/auto_parallel/reshard_function.h"
+
+namespace phi {
+namespace distributed {
+
+class SToSReshardFunction final : public ReshardFunction {
+ public:
+  SToSReshardFunction() = default;
+  ~SToSReshardFunction() = default;
+
+  bool IsSuitable(const DistTensor& in,
+                  const TensorDistAttr& out_dist_attr) override;
+
+  void Eval(DeviceContext* dev_ctx,
+            const DistTensor& in,
+            const TensorDistAttr& out_dist_attr,
+            DistTensor* out) override;
+};
+
+}  // namespace distributed
+}  // namespace phi
diff --git a/paddle/phi/kernels/all_to_all_kernel.h b/paddle/phi/kernels/all_to_all_kernel.h
index 3d8594b469c78..5444960b1f69e 100644
--- a/paddle/phi/kernels/all_to_all_kernel.h
+++ b/paddle/phi/kernels/all_to_all_kernel.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace phi {
 
@@ -23,4 +24,13 @@ void AllToAllKernel(const Context& dev_ctx,
                     const DenseTensor& x,
                     DenseTensor* out);
 
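+// Convenience wrapper that runs AllToAllInferMeta before launching the
+// kernel, for in-library callers such as the reshard functions. An
+// illustrative call site (dtype assumed): AllToAll<float>(dev_ctx, x, &out);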
+template <typename T, typename Context>
+void AllToAll(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
+  MetaTensor out_meta(*out);
+  MetaTensor* out_meta_ptr = &out_meta;
+
+  AllToAllInferMeta(phi::MetaTensor(x), out_meta_ptr);
+  AllToAllKernel<T, Context>(dev_ctx, x, out);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/cpu/all_to_all_kernel.cc b/paddle/phi/kernels/cpu/all_to_all_kernel.cc
index 5ea535c6e5e90..3407a1828e208 100644
--- a/paddle/phi/kernels/cpu/all_to_all_kernel.cc
+++ b/paddle/phi/kernels/cpu/all_to_all_kernel.cc
@@ -78,6 +78,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    bool,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    phi::dtype::float16) {}
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
@@ -90,6 +91,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    bool,
                    int8_t,
+                   int16_t,
                    uint8_t,
                    int64_t,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
index 590d227d1e518..cc3340edcb4ab 100644
--- a/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/transpose_grad_kernel.cc
@@ -28,6 +28,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/transpose_kernel.cc b/paddle/phi/kernels/cpu/transpose_kernel.cc
index 73359479770fd..bab9d47caa9aa 100644
--- a/paddle/phi/kernels/cpu/transpose_kernel.cc
+++ b/paddle/phi/kernels/cpu/transpose_kernel.cc
@@ -87,7 +87,9 @@ PD_REGISTER_KERNEL(transpose,
                    double,
                    int32_t,
                    int64_t,
+                   uint8_t,
                    int8_t,
+                   int16_t,
                    phi::dtype::float16,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
diff --git a/paddle/phi/kernels/gpu/all_to_all_kernel.cu b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
index 3e1008c408179..6d50e2ceb1ae5 100644
--- a/paddle/phi/kernels/gpu/all_to_all_kernel.cu
+++ b/paddle/phi/kernels/gpu/all_to_all_kernel.cu
@@ -94,6 +94,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::bfloat16,
@@ -108,6 +109,7 @@ PD_REGISTER_KERNEL(all_to_all,
                    int,
                    int8_t,
                    uint8_t,
+                   int16_t,
                    int64_t,
                    bool,
                    phi::dtype::float16) {}
diff --git a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
index 35215ed5b96ed..c90d0bc40875b 100644
--- a/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/transpose_grad_kernel.cu
@@ -26,6 +26,9 @@ PD_REGISTER_KERNEL(transpose_grad,
                    bool,
                    float,
                    double,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
                    int32_t,
                    int64_t,
                    phi::dtype::float16,
diff --git a/paddle/phi/kernels/reshape_kernel.h b/paddle/phi/kernels/reshape_kernel.h
index da8679cc39980..972d72ad706d9 100644
--- a/paddle/phi/kernels/reshape_kernel.h
+++ b/paddle/phi/kernels/reshape_kernel.h
@@ -41,14 +41,22 @@ void ReshapeStridedKernel(const Context& dev_ctx,
                           DenseTensor* out,
                           DenseTensor* xshape);
 
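+// Overload that writes into a caller-provided output tensor so that reshard
+// code can fill a preallocated DenseTensor; the value-returning overload
+// below forwards to it.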
+template <typename Context>
+void Reshape(const Context& dev_ctx,
+             const DenseTensor& x,
+             const std::vector<int64_t>& shape,
+             DenseTensor* out) {
+  MetaTensor meta_out(out);
+  InferMetaFromVecValue(x, shape, &meta_out);
+  ReshapeInferKernel<Context>(dev_ctx, x, IntArray(shape), out);
+}
+
 template <typename Context>
 DenseTensor Reshape(const Context& dev_ctx,
                     const DenseTensor& x,
                     const std::vector<int64_t>& shape) {
   DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
-  InferMetaFromVecValue(x, shape, &meta_out);
-  ReshapeInferKernel<Context>(dev_ctx, x, IntArray(shape), &dense_out);
+  Reshape<Context>(dev_ctx, x, shape, &dense_out);
   return dense_out;
 }
diff --git a/paddle/phi/kernels/transpose_kernel.h b/paddle/phi/kernels/transpose_kernel.h
index 2c576a85710cf..5555586c04387 100644
--- a/paddle/phi/kernels/transpose_kernel.h
+++ b/paddle/phi/kernels/transpose_kernel.h
@@ -34,17 +34,24 @@ void TransposeStridedKernel(const Context& dev_ctx,
                             DenseTensor* out);
 
 template <typename T, typename Context>
-DenseTensor Transpose(const Context& dev_ctx,
-                      const DenseTensor& x,
-                      const std::vector<int>& axis) {
-  DenseTensor dense_out;
-  MetaTensor meta_out(&dense_out);
+void Transpose(const Context& dev_ctx,
+               const DenseTensor& x,
+               const std::vector<int>& axis,
+               DenseTensor* dense_out) {
+  MetaTensor meta_out(dense_out);
   TransposeInferMeta(x, axis, &meta_out);
   // do not call TransposeStridedKernel, because some other kernels call
   // Transpose directly
-  TransposeKernel<T, Context>(dev_ctx, x, axis, &dense_out);
+  TransposeKernel<T, Context>(dev_ctx, x, axis, dense_out);
+}
+
+template <typename T, typename Context>
+DenseTensor Transpose(const Context& dev_ctx,
+                      const DenseTensor& x,
+                      const std::vector<int>& axis) {
+  DenseTensor dense_out;
+  Transpose<T, Context>(dev_ctx, x, axis, &dense_out);
   return dense_out;
 }
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
index 16ac0758ae96a..1f5f9177fc63f 100644
--- a/test/auto_parallel/CMakeLists.txt
+++ b/test/auto_parallel/CMakeLists.txt
@@ -82,6 +82,9 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
   py_test_modules(test_reshard_s_to_r MODULES test_reshard_s_to_r)
   set_tests_properties(test_reshard_s_to_r
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
+  py_test_modules(test_reshard_s_to_s MODULES test_reshard_s_to_s)
+  set_tests_properties(test_reshard_s_to_s
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
   py_test_modules(test_reshard_r_to_s MODULES test_reshard_r_to_s)
   set_tests_properties(test_reshard_r_to_s
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 100)
diff --git a/test/auto_parallel/reshard_s_to_s.py b/test/auto_parallel/reshard_s_to_s.py
new file mode 100644
index 0000000000000..dfdea856ab8fe
--- /dev/null
+++ b/test/auto_parallel/reshard_s_to_s.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+from paddle.framework import core
+
+
+class TestReshardSToS:
+    def __init__(self):
+        self._shape = eval(os.getenv("shape"))
+        self._dtype = os.getenv("dtype")
+        self._seeds = eval(os.getenv("seeds"))
+        self._backend = os.getenv("backend")
+        self._mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+    def test_body(self, in_shard, out_shard):
+        if self._backend == "cpu":
+            paddle.set_device("cpu")
+            place = paddle.CPUPlace()
+        elif self._backend == "gpu":
+            place = paddle.CUDAPlace(dist.get_rank())
+
+        dev_ctx = core.DeviceContext.create(place)
+        a = paddle.ones(self._shape)
+
+        in_shard_specs = [None for i in range(len(self._shape))]
+        in_shard_specs[in_shard] = "x"
+        out_shard_specs = [None for i in range(len(self._shape))]
+        out_shard_specs[out_shard] = "x"
+
+        dist_attr = dist.DistAttr(
+            mesh=self._mesh, sharding_specs=in_shard_specs
+        )
+        out_dist_attr = dist.DistAttr(
+            mesh=self._mesh, sharding_specs=out_shard_specs
+        )
+
+        input_tensor = dist.shard_tensor(a, dist_attr=dist_attr)
+
+        reshard_func = core.SToSReshardFunction()
+        assert reshard_func.is_suitable(input_tensor, out_dist_attr)
+
+        out = reshard_func.eval(dev_ctx, input_tensor, out_dist_attr)
+        out_shape = list(self._shape)
+
+        out_shape[out_shard] = out_shape[out_shard] // 2
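+
+        # With a 1-D mesh over 2 ranks, resharding halves the local extent on
+        # the new shard axis, e.g. a (6, 20) input going from shard(0) to
+        # shard(1) leaves each rank with a (6, 10) slice.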
+        assert np.equal(out.shape, input_tensor.shape).all()
+        assert np.equal(out._local_shape, out_shape).all()
+
+    def test_case1(self):
+        self.test_body(0, len(self._shape) - 1)
+
+    def test_case2(self):
+        self.test_body(len(self._shape) - 1, 0)
+
+    def run_test_case(self):
+        self.test_case1()
+        self.test_case2()
+
+
+if __name__ == '__main__':
+    TestReshardSToS().run_test_case()
diff --git a/test/auto_parallel/test_reshard_s_to_s.py b/test/auto_parallel/test_reshard_s_to_s.py
new file mode 100644
index 0000000000000..ab3f470a6568e
--- /dev/null
+++ b/test/auto_parallel/test_reshard_s_to_s.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import collective.test_communication_api_base as test_base
+
+
+class TestReshardSToS(test_base.CommunicationTestDistBase):
+    def setUp(self):
+        super().setUp(num_of_devices=2, timeout=120)
+        self._default_envs = {
+            "shape": "(6, 20)",
+            "dtype": "float32",
+            "seeds": str(self._seeds),
+            "backend": "gpu",
+        }
+        self._changeable_envs = {
+            "shape": ["(6, 20)", "(6, 20, 10)"],
+        }
+
+    def test_reshard_s_to_s(self):
+        envs_list = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for envs in envs_list:
+            self.run_test_case(
+                "reshard_s_to_s.py",
+                user_defined_envs=envs,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()

From 544b0397446b781e538fb64eed73a8cb1298fde6 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Thu, 14 Sep 2023 11:07:54 +0800
Subject: [PATCH 2/3] refine, remove useless code

---
 .../auto_parallel/r_to_s_reshard_function.cc        | 6 +++---
 .../core/distributed/auto_parallel/reshard_utils.cc | 4 ++--
 .../core/distributed/auto_parallel/reshard_utils.h  | 2 +-
 .../auto_parallel/s_to_r_reshard_function.cc        | 8 ++------
 .../auto_parallel/s_to_s_reshard_function.cc        | 7 +++----
 5 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
index a3c616f5e8b35..381f77991f72d 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_s_reshard_function.cc
@@ -51,11 +51,11 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
 
   DenseTensor out_physical_tensor_cur_rank;
 
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
+  std::map<int, int64_t> split_axis_to_mesh_axis =
       GetSplitAxisWithDimsMapping(out_dims_mapping);
   std::vector<int64_t> coord_in_mesh = GetCurRankCoordInMesh(out_process_mesh);
 
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = split_axis_to_mesh_axis.begin()->first;
   int64_t mesh_axis = split_axis_to_mesh_axis.begin()->second;
 
   int64_t num_of_process = out_process_mesh.shape()[mesh_axis];
@@ -65,7 +65,7 @@ void RToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
          << " process participate in.";
 
   std::vector<int64_t> split_num_vec =
-      BalancedSplit(in.dims()[static_cast<int>(split_axis)], num_of_process);
+      BalancedSplit(in.dims()[split_axis], num_of_process);
   IntArray sections(split_num_vec);
 
   std::vector<DenseTensor> split_out_vec;
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
index 1d9677a0a2bc5..2767dfa836394 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.cc
@@ -112,9 +112,9 @@ CommContext* CreateOrGetCommContext(const DeviceContext& dev_ctx,
   return comm_context;
 }
 
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping) {
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis;
+  std::map<int, int64_t> split_axis_to_mesh_axis;
   for (size_t i = 0; i < dims_mapping.size(); ++i) {
     if (dims_mapping[i] != -1) {
       split_axis_to_mesh_axis.emplace(i, dims_mapping[i]);
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
index 61a3fbbfcad87..b947c70bb5bc9 100644
--- a/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
+++ b/paddle/phi/core/distributed/auto_parallel/reshard_utils.h
@@ -40,7 +40,7 @@ std::vector<int64_t> GetCurRankCoordInMesh(const ProcessMesh& process_mesh);
 // input vector, return a key-value map of tensor_split_axis and
 // process_mesh_split_axis.
 // For example, if dims_mapping is [-1, 1, -1, 0], will return {1: 1, 3: 0}.
-std::map<int64_t, int64_t> GetSplitAxisWithDimsMapping(
+std::map<int, int64_t> GetSplitAxisWithDimsMapping(
     const std::vector<int64_t>& dims_mapping);
 
 // If given a number, balance split it to multiple pieces.
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
index 20adea5ea90bb..efa5035c495ed 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_r_reshard_function.cc
@@ -44,9 +44,7 @@ bool SToRReshardFunction::IsSuitable(const DistTensor& in,
 
   // Ensure the tensor is balanced split, or we need send/recv rather than
   // all_gather
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
   int64_t num_of_process = in_process_mesh.size();
   flag &= (in.local_dims()[static_cast<int>(split_axis)] * num_of_process ==
            in.dims()[static_cast<int>(split_axis)]);
@@ -74,9 +72,7 @@ void SToRReshardFunction::Eval(DeviceContext* dev_ctx,
                             in.value(),
                             in_process_ids.size(),
                             GetMutableTensor(out));
 
-  std::map<int64_t, int64_t> split_axis_to_mesh_axis =
-      GetSplitAxisWithDimsMapping(in_dims_mapping);
-  int64_t split_axis = split_axis_to_mesh_axis.begin()->first;
+  int split_axis = GetSplitAxisWithDimsMapping(in_dims_mapping).begin()->first;
 
   if (split_axis == 0) {
     // If the input dist tensor is shard(0), the subsequent split
diff --git a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
index de60a94f9452a..45ec290973446 100644
--- a/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.cc
@@ -14,7 +14,6 @@
 
 #include "paddle/phi/core/distributed/auto_parallel/s_to_s_reshard_function.h"
 
-#include "glog/logging.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/reshard_utils.h"
@@ -52,9 +51,9 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
   auto dtype = in.dtype();
   const auto& logical_ddim = in.dims();
   int64_t nranks = in_process_ids.size();
-  int64_t in_split_axis =
+  int in_split_axis =
       GetSplitAxisWithDimsMapping(in.dist_attr().dims_mapping()).begin()->first;
-  int64_t out_split_axis =
+  int out_split_axis =
       GetSplitAxisWithDimsMapping(out_dist_attr.dims_mapping()).begin()->first;
 
   DenseTensor in_all_to_all = in.value();
@@ -74,7 +73,7 @@ void SToSReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     std::vector<int> axis;
     axis.emplace_back(out_split_axis);
     for (size_t i = 0; i < pre_shape_vec.size(); ++i) {
-      if (static_cast<int64_t>(i) != out_split_axis) {
+      if (static_cast<int>(i) != out_split_axis) {
         axis.emplace_back(i);
       }
     }

From 75df74b90e5be22a6c9eead2cf46a39f959f3725 Mon Sep 17 00:00:00 2001
From: LiYuRio
Date: Thu, 14 Sep 2023 14:26:21 +0800
Subject: [PATCH 3/3] reduce changed files

---
 .../auto_parallel/r_to_p_reshard_function.cc | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
index bd2cb4c58a46c..d8681218ae68a 100644
--- a/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
+++ b/paddle/phi/core/distributed/auto_parallel/r_to_p_reshard_function.cc
@@ -54,8 +54,20 @@ void RToPReshardFunction::Eval(phi::DeviceContext* dev_ctx,
     RESHARD_FUNCTOR(dev_ctx, Full, in.dtype(), shape, 0, GetMutableTensor(out));
   } else {
     // assign the input value to output
-    RESHARD_FUNCTOR_WITHOUT_DTYPE(
-        dev_ctx, Assign, in.value(), GetMutableTensor(out));
+    if (phi::CPUContext::classof(dev_ctx)) {
+      Assign(static_cast<const phi::CPUContext&>(*dev_ctx),
+             in.value(),
+             GetMutableTensor(out));
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    } else if (phi::GPUContext::classof(dev_ctx)) {
+      Assign(static_cast<const phi::GPUContext&>(*dev_ctx),
+             in.value(),
+             GetMutableTensor(out));
+#endif
+    } else {
+      PADDLE_THROW(phi::errors::Unimplemented(
+          "The assign in reshard is only supported on CPU and GPU for now."));
+    }
   }
   SetDistProps(out, in.dims(), out_dist_attr);
 }