PaddlePaddle · wanghuancoder · Nov 8, 2023 · Nov 8, 2023 · Nov 8, 2023
diff --git a/paddle/fluid/eager/pylayer/py_layer_node.cc b/paddle/fluid/eager/pylayer/py_layer_node.cc
@@ -62,35 +62,90 @@ GradNodePyLayer::operator()(
       PyObject* pylist = PyList_New((Py_ssize_t)grads[i].size());
       for (size_t j = 0; j < grads[i].size(); j++) {
         if (ctx->materialize_grads && !grads[i][j].initialized()) {
-          paddle::Tensor tensor_tmp;
-          auto dense_tensor = std::make_shared<phi::DenseTensor>();
-          dense_tensor->set_meta(forward_outputs_meta_[i][j]);
-          tensor_tmp.set_impl(dense_tensor);
-          PyList_SET_ITEM(
-              pylist,
-              static_cast<Py_ssize_t>(i),
-              paddle::pybind::ToPyObject(paddle::experimental::zeros_like(
-                  tensor_tmp,
-                  tensor_tmp.dtype(),
-                  forward_outputs_place_[i][j])));
+          // grads[i][j] is uninitialized but materialize_grads is set: give
+          // the Python backward a zero tensor built from the saved meta.
+          if (forward_outputs_is_dist_meta_[i][j]) {
+            // Recreate the DistTensor from the saved global dims/dist attr.
+            paddle::Tensor dist_tensor;
+            dist_tensor.set_impl(std::make_shared<phi::distributed::DistTensor>(
+                forward_outputs_global_dims_[i][j],
+                forward_outputs_dist_attr_[i][j]));
+            // NOTE(review): dims.size() == -1 presumably marks a meta that
+            // was never filled in; the local value is then left as created.
+            if (forward_outputs_meta_[i][j].dims.size() != -1) {
+              paddle::Tensor tensor_tmp;
+              auto dense_tensor = std::make_shared<phi::DenseTensor>();
+              dense_tensor->set_meta(forward_outputs_meta_[i][j]);
+              tensor_tmp.set_impl(dense_tensor);
+              auto zero_tensor = paddle::experimental::zeros_like(
+                  tensor_tmp, tensor_tmp.dtype(), forward_outputs_place_[i][j]);
+              *(static_cast<phi::distributed::DistTensor*>(
+                    dist_tensor.impl().get())
+                    ->unsafe_mutable_value()) =
+                  *(static_cast<phi::DenseTensor*>(zero_tensor.impl().get()));
+            }
+            // pylist was created by PyList_New, so the PyList macro applies.
+            PyList_SET_ITEM(pylist,
+                            static_cast<Py_ssize_t>(j),
+                            paddle::pybind::ToPyObject(dist_tensor));
+          } else {
+            // Dense output: zeros with the saved meta on the saved place.
+            paddle::Tensor tensor_tmp;
+            auto dense_tensor = std::make_shared<phi::DenseTensor>();
+            dense_tensor->set_meta(forward_outputs_meta_[i][j]);
+            tensor_tmp.set_impl(dense_tensor);
+            PyList_SET_ITEM(
+                pylist,
+                static_cast<Py_ssize_t>(j),
+                paddle::pybind::ToPyObject(paddle::experimental::zeros_like(
+                    tensor_tmp,
+                    tensor_tmp.dtype(),
+                    forward_outputs_place_[i][j])));
+          }
         } else {
+          // Element j of the slot fills list position j.
           PyList_SET_ITEM(pylist,
-                          static_cast<Py_ssize_t>(i),
-                          paddle::pybind::ToPyObject(grads[i][0], true));
+                          static_cast<Py_ssize_t>(j),
+                          paddle::pybind::ToPyObject(grads[i][j], true));
         }
       }
       PyTuple_SET_ITEM(backward_args, i, pylist);
     } else {
       if (ctx->materialize_grads && !grads[i][0].initialized()) {
-        paddle::Tensor tensor_tmp;
-        auto dense_tensor = std::make_shared<phi::DenseTensor>();
-        dense_tensor->set_meta(forward_outputs_meta_[i][0]);
-        tensor_tmp.set_impl(dense_tensor);
-        PyTuple_SET_ITEM(
-            backward_args,
-            i,
-            paddle::pybind::ToPyObject(paddle::experimental::zeros_like(
-                tensor_tmp, tensor_tmp.dtype(), forward_outputs_place_[i][0])));
+        // Single-tensor slot: same zero filling, stored directly in the
+        // backward_args tuple.
+        if (forward_outputs_is_dist_meta_[i][0]) {
+          paddle::Tensor dist_tensor;
+          dist_tensor.set_impl(std::make_shared<phi::distributed::DistTensor>(
+              forward_outputs_global_dims_[i][0],
+              forward_outputs_dist_attr_[i][0]));
+          if (forward_outputs_meta_[i][0].dims.size() != -1) {
+            paddle::Tensor tensor_tmp;
+            auto dense_tensor = std::make_shared<phi::DenseTensor>();
+            dense_tensor->set_meta(forward_outputs_meta_[i][0]);
+            tensor_tmp.set_impl(dense_tensor);
+            auto zero_tensor = paddle::experimental::zeros_like(
+                tensor_tmp, tensor_tmp.dtype(), forward_outputs_place_[i][0]);
+            *(static_cast<phi::distributed::DistTensor*>(
+                  dist_tensor.impl().get())
+                  ->unsafe_mutable_value()) =
+                *(static_cast<phi::DenseTensor*>(zero_tensor.impl().get()));
+          }
+          PyTuple_SET_ITEM(
+              backward_args, i, paddle::pybind::ToPyObject(dist_tensor));
+        } else {
+          paddle::Tensor tensor_tmp;
+          auto dense_tensor = std::make_shared<phi::DenseTensor>();
+          dense_tensor->set_meta(forward_outputs_meta_[i][0]);
+          tensor_tmp.set_impl(dense_tensor);
+          PyTuple_SET_ITEM(
+              backward_args,
+              i,
+              paddle::pybind::ToPyObject(paddle::experimental::zeros_like(
+                  tensor_tmp,
+                  tensor_tmp.dtype(),
+                  forward_outputs_place_[i][0])));
+        }
       } else {
         PyTuple_SET_ITEM(
             backward_args, i, paddle::pybind::ToPyObject(grads[i][0], true));

diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h
@@ -24,6 +24,7 @@
 #include "paddle/fluid/eager/grad_node_info.h"
 #include "paddle/fluid/eager/hooks.h"
 #include "paddle/phi/core/compat/convert_utils.h"
+#include "paddle/phi/core/distributed/auto_parallel/dist_tensor.h"
 #include "paddle/phi/core/tensor_meta.h"
 
 namespace egr {
@@ -63,15 +64,38 @@ class GradNodePyLayer : public GradNodeBase {
       const std::vector<std::vector<paddle::Tensor*>>& outputs_tensor) {
     forward_outputs_meta_.resize(outputs_tensor.size());
     forward_outputs_place_.resize(outputs_tensor.size());
+    // Distributed metadata is stored per slot, index-aligned with the meta.
+    forward_outputs_dist_attr_.resize(outputs_tensor.size());
+    forward_outputs_global_dims_.resize(outputs_tensor.size());
+    forward_outputs_is_dist_meta_.resize(outputs_tensor.size());
     for (size_t i = 0; i < outputs_tensor.size(); i++) {
       forward_outputs_meta_[i].reserve(outputs_tensor[i].size());
       forward_outputs_place_[i].reserve(outputs_tensor[i].size());
+      forward_outputs_dist_attr_[i].reserve(outputs_tensor[i].size());
+      forward_outputs_global_dims_[i].reserve(outputs_tensor[i].size());
+      forward_outputs_is_dist_meta_[i].reserve(outputs_tensor[i].size());
       for (auto tensor : outputs_tensor[i]) {
         if (tensor->is_dense_tensor()) {
           forward_outputs_meta_[i].push_back(
               static_cast<phi::DenseTensor*>(tensor->impl().get())->meta());
+          // Default-constructed placeholders keep element j of every vector
+          // referring to the same output tensor.
+          forward_outputs_dist_attr_[i].emplace_back();
+          forward_outputs_global_dims_[i].emplace_back();
+          forward_outputs_is_dist_meta_[i].push_back(false);
+        } else if (tensor->is_dist_tensor()) {
+          auto* dist_tensor =
+              static_cast<phi::distributed::DistTensor*>(tensor->impl().get());
+          // Dense meta of value(), plus dist attr and dims() as global dims.
+          forward_outputs_meta_[i].push_back(dist_tensor->value().meta());
+          forward_outputs_dist_attr_[i].push_back(dist_tensor->dist_attr());
+          forward_outputs_global_dims_[i].push_back(dist_tensor->dims());
+          forward_outputs_is_dist_meta_[i].push_back(true);
         } else {
           forward_outputs_meta_[i].emplace_back();
+          forward_outputs_dist_attr_[i].emplace_back();
+          forward_outputs_global_dims_[i].emplace_back();
+          forward_outputs_is_dist_meta_[i].push_back(false);
         }
         forward_outputs_place_[i].emplace_back(tensor->place());
       }
@@ -89,6 +113,12 @@ class GradNodePyLayer : public GradNodeBase {
   std::string name_{""};
   std::vector<std::vector<phi::DenseTensorMeta>> forward_outputs_meta_;
   std::vector<std::vector<paddle::platform::Place>> forward_outputs_place_;
+  // Per-output distributed metadata; the entries at [i][j] are meaningful
+  // only when forward_outputs_is_dist_meta_[i][j] is true.
+  std::vector<std::vector<phi::distributed::TensorDistAttr>>
+      forward_outputs_dist_attr_;
+  std::vector<std::vector<phi::DDim>> forward_outputs_global_dims_;
+  std::vector<std::vector<bool>> forward_outputs_is_dist_meta_;
 };
 
 }  // namespace egr
diff --git a/test/auto_parallel/CMakeLists.txt b/test/auto_parallel/CMakeLists.txt
@@ -119,6 +119,13 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
                   test_semi_auto_parallel_basic)
   set_tests_properties(test_semi_auto_parallel_basic
                        PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 200)
+  # The driver launches the script once per backend (cpu and gpu) and sets
+  # timeout=120 on its base class -- presumably a per-launch limit -- so the
+  # CTest limit is kept above two full launches.
+  py_test_modules(test_semi_auto_parallel_pylayer MODULES
+                  test_semi_auto_parallel_pylayer)
+  set_tests_properties(test_semi_auto_parallel_pylayer
+                       PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 300)
   py_test_modules(test_semi_auto_parallel_single_strategy MODULES
                   test_semi_auto_parallel_single_strategy)
   set_tests_properties(test_semi_auto_parallel_single_strategy

diff --git a/test/auto_parallel/semi_auto_parallel_pylayer.py b/test/auto_parallel/semi_auto_parallel_pylayer.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import unittest
+
+import numpy as np
+
+import paddle
+import paddle.distributed as dist
+from paddle.autograd.py_layer import PyLayer
+
+
+class TestNet(PyLayer):
+    """Two-output PyLayer whose backward forwards the incoming grads."""
+
+    @staticmethod
+    def forward(ctx, x1, x2, x3):
+        y1 = paddle.matmul(x1, x2, transpose_x=False, transpose_y=False)
+        y2 = paddle.matmul(x2, x3, transpose_x=False, transpose_y=False)
+        return y1, y2
+
+    @staticmethod
+    def backward(ctx, dy1, dy2):
+        # The output grads are returned as-is; dy2 is used for both x2 and x3.
+        return dy1, dy2, dy2
+
+
+class TestPyLayerForSemiAutoParallel(unittest.TestCase):
+    """Checks TestNet on dist tensors against the same run on dense tensors."""
+
+    def run_test_case(self):
+        # The unittest driver exports "backend" and "seed"; when a variable
+        # is absent (direct run) the process default is left untouched.
+        backend = os.getenv("backend")
+        if backend == "cpu":
+            paddle.set_device("cpu")
+        elif backend == "gpu":
+            paddle.set_device("gpu:" + str(dist.get_rank()))
+        seed = os.getenv("seed")
+        if seed is not None:
+            np.random.seed(int(seed))
+
+        mesh = dist.ProcessMesh([0, 1], dim_names=["x"])
+
+        x1_np = np.random.random(size=[64, 32]).astype(np.float32)
+        x2_np = np.random.random(size=[32, 48]).astype(np.float32)
+        x3_np = np.random.random(size=[48, 64]).astype(np.float32)
+        x1 = paddle.to_tensor(x1_np)
+        x2 = paddle.to_tensor(x2_np)
+        x3 = paddle.to_tensor(x3_np)
+        x1.stop_gradient = False
+        x2.stop_gradient = False
+        x3.stop_gradient = False
+
+        # Every sharding spec is None: no tensor axis is mapped to the mesh.
+        x1_dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=[None, None])
+        x2_dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=[None, None])
+        x3_dist_attr = dist.DistAttr(mesh=mesh, sharding_specs=[None, None])
+
+        dist_x1 = dist.shard_tensor(x1_np, dist_attr=x1_dist_attr)
+        dist_x2 = dist.shard_tensor(x2_np, dist_attr=x2_dist_attr)
+        dist_x3 = dist.shard_tensor(x3_np, dist_attr=x3_dist_attr)
+        dist_x1.stop_gradient = False
+        dist_x2.stop_gradient = False
+        dist_x3.stop_gradient = False
+
+        y1, y2 = TestNet.apply(x1, x2, x3)
+        loss = y1.sum()
+
+        dist_y1, dist_y2 = TestNet.apply(dist_x1, dist_x2, dist_x3)
+        dist_loss = dist_y1.sum()
+
+        np.testing.assert_allclose(
+            loss.numpy(), dist_loss.numpy(), rtol=1e-04, verbose=True
+        )
+
+        # Only y1 contributes to the loss, so no upstream grad exists for y2.
+        loss.backward()
+        dist_loss.backward()
+
+        np.testing.assert_allclose(
+            x1.grad.numpy(), dist_x1.grad.numpy(), rtol=1e-04, verbose=True
+        )
+        np.testing.assert_allclose(
+            x2.grad.numpy(), dist_x2.grad.numpy(), rtol=1e-04, verbose=True
+        )
+        np.testing.assert_allclose(
+            x3.grad.numpy(), dist_x3.grad.numpy(), rtol=1e-04, verbose=True
+        )
+
+
+if __name__ == '__main__':
+    TestPyLayerForSemiAutoParallel().run_test_case()
diff --git a/test/auto_parallel/test_semi_auto_parallel_pylayer.py b/test/auto_parallel/test_semi_auto_parallel_pylayer.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import collective.test_communication_api_base as test_base
+
+
+class TestSemiAutoParallelPyLayer(test_base.CommunicationTestDistBase):
+    """Drives semi_auto_parallel_pylayer.py via the dist test base."""
+
+    def setUp(self):
+        super().setUp(num_of_devices=2, timeout=120)
+        self._default_envs = {"dtype": "float32", "seed": "2023"}
+        # NOTE(review): gen_product_envs_list presumably merges the defaults
+        # with each backend value, one env dict per backend -- confirm.
+        self._changeable_envs = {"backend": ["cpu", "gpu"]}
+
+    def test_pylayer(self):
+        script = "semi_auto_parallel_pylayer.py"
+        env_combos = test_base.gen_product_envs_list(
+            self._default_envs, self._changeable_envs
+        )
+        for env_combo in env_combos:
+            self.run_test_case(script, user_defined_envs=env_combo)
+
+
+# Entry point for running this driver file directly.
+if __name__ == "__main__":
+    unittest.main()