From e8a7d3ee77bd1fa871eea9751756ce6a5aac1ed4 Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Mon, 1 Nov 2021 12:47:48 +0000
Subject: [PATCH 1/7] Expose func for varbase

---
 paddle/fluid/imperative/layer.cc  | 64 ++++++++++++++++++++++++++
 paddle/fluid/imperative/layer.h   |  4 ++
 paddle/fluid/pybind/imperative.cc | 75 +++++++++++++++++++++++++++++++
 3 files changed, 143 insertions(+)
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 53ae5b8127fdba..3ca2133657a68c 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -356,6 +356,70 @@ void VarBase::BumpInplaceVersion() {
   MutableVar()->BumpInplaceVersion();
 }
 
+std::shared_ptr<VarBase> VarBase::To(const platform::Place& dst_place,
+                                     framework::proto::VarType::Type data_type,
+                                     const bool blocking) const {
+  PADDLE_ENFORCE_EQ(
+      Var().IsInitialized() && (Var().IsType<framework::LoDTensor>() ||
+                                Var().IsType<framework::SelectedRows>()),
+      true, platform::errors::InvalidArgument(
+                "Variable is not initialized or Variable's type is not "
+                "LoDTensor or SelectedRows when getting numpy tensor"));
+
+  if (Var().IsType<framework::LoDTensor>()) {
+    auto& src_tensor = Var().Get<framework::LoDTensor>();
+    // TODO(Jiabin): change this after move unique_name generator to CXX
+    auto new_var = std::make_shared<VarBase>(
+        true, Name() + std::to_string(copied_counter_++));
+
+    new_var->SetPersistable(Persistable());
+    new_var->SetDataType(data_type);
+    new_var->SetType(Type());
+    auto* dst_tensor =
+        new_var->MutableVar()->GetMutable<framework::LoDTensor>();
+    dst_tensor->set_lod(src_tensor.lod());
+    framework::TensorCopy(src_tensor, dst_place, dst_tensor);
+    if (blocking) {
+      platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
+      auto src_place = src_tensor.place();
+      if (!(src_place == dst_place)) {
+        platform::DeviceContextPool::Instance().Get(src_place)->Wait();
+      }
+    }
+    VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
+            << dst_place;
+    VLOG(4) << "copy tensor " << Name() << " from " << DataType() << " to "
+            << data_type;
+
+    return new_var;
+  } else {
+    auto& src_selected_rows = Var().Get<framework::SelectedRows>();
+    auto new_var = std::make_shared<VarBase>(
+        false, "Itmp" + std::to_string(copied_counter_++));
+    new_var->SetType(framework::proto::VarType::SELECTED_ROWS);
+    new_var->SetDataType(data_type);
+    auto* dst_selected_rows =
+        new_var->MutableVar()->GetMutable<framework::SelectedRows>();
+
+    framework::TensorCopy(src_selected_rows.value(), dst_place,
+                          dst_selected_rows->mutable_value());
+    if (blocking) {
+      platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
+      auto src_place = src_selected_rows.place();
+      if (!(src_place == dst_place)) {
+        platform::DeviceContextPool::Instance().Get(src_place)->Wait();
+      }
+    }
+    dst_selected_rows->set_height(src_selected_rows.height());
+    dst_selected_rows->set_rows(src_selected_rows.rows());
+    VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
+            << dst_place;
+    VLOG(4) << "copy tensor " << Name() << " from " << DataType() << " to "
+            << data_type;
+    return new_var;
+  }
+}
+
 void OpBase::SetType(const std::string& type) {
   op_ = framework::OpRegistry::CreateOp(type, {}, {}, {}, false);
 }
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 16580627ed1964..3443a61e659d5d 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -252,6 +252,10 @@ class VarBase {
         std::forward<std::shared_ptr<std::function<void()>>>(hook));
   }
 
+  std::shared_ptr<VarBase> To(const platform::Place& dst_place,
+                              framework::proto::VarType::Type data_type,
+                              const bool blocking) const;
+
  private:
   /**
    * NOTE(zengjinle): never remove the const qualifier of `var_` if you are
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 4403eb469723a5..f8a08fde47f269 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1865,6 +1865,81 @@ void BindImperative(py::module *m_ptr) {
            py::return_value_policy::copy)
       .def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
            py::return_value_policy::reference)
+      .def("_clear",
+           [](const std::shared_ptr<imperative::VarBase> &self) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             t->clear();
+           })
+      .def("_offset",
+           [](const std::shared_ptr<imperative::VarBase> &self) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             return t->offset();
+           })
+      .def("_share_buffer_with",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              std::shared_ptr<imperative::VarBase> &target_t) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             auto *t_t =
+                 target_t->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             PADDLE_ENFORCE_EQ(t_t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             t->ShareBufferWith(*t_t);
+           })
+      .def("_is_shared_buffer_with",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              std::shared_ptr<imperative::VarBase> &target_t) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             auto *t_t =
+                 target_t->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             PADDLE_ENFORCE_EQ(t_t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             return t->IsSharedBufferWith(*t_t);
+           })
+      .def("_Slice",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              int64_t begin_idx, int64_t end_idx) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             return t->Slice(begin_idx, end_idx);
+           })
+      .def("_To",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              const platform::Place &place,
+              framework::proto::VarType::Type data_type, bool blocking) {
+             auto new_var = self->To(place, data_type, blocking);
+             if (!blocking) {
+               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+             }
+             return new_var;
+           },
+           py::return_value_policy::copy)
+      .def("_To",
+           [](const std::shared_ptr<imperative::VarBase> &self,
+              const platform::CPUPlace &place,
+              framework::proto::VarType::Type data_type, bool blocking) {
+             auto new_var = self->To(place, data_type, blocking);
+             if (!blocking) {
+               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
+             }
+             return new_var;
+           },
+           py::return_value_policy::copy)
       .def_property("name", &imperative::VarBase::Name,
                     &imperative::VarBase::SetName)
       .def_property("stop_gradient",

From 05268199ee0ff4da72365ca924b45f36005c74c6 Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Wed, 3 Nov 2021 07:36:04 +0000
Subject: [PATCH 2/7] Expose func for varbase and enhance varbase init func

---
 paddle/fluid/imperative/layer.cc              |  85 +++++----------
 paddle/fluid/imperative/layer.h               |   6 +-
 paddle/fluid/pybind/imperative.cc             |  66 ++++++++----
 .../fluid/dygraph/varbase_patch_methods.py    |  45 +++++++-
 .../fluid/tests/unittests/test_var_base.py    | 100 ++++++++++++++++++
 5 files changed, 213 insertions(+), 89 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 3ca2133657a68c..d24af5dfef826c 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -356,67 +356,30 @@ void VarBase::BumpInplaceVersion() {
   MutableVar()->BumpInplaceVersion();
 }
 
-std::shared_ptr<VarBase> VarBase::To(const platform::Place& dst_place,
-                                     framework::proto::VarType::Type data_type,
-                                     const bool blocking) const {
-  PADDLE_ENFORCE_EQ(
-      Var().IsInitialized() && (Var().IsType<framework::LoDTensor>() ||
-                                Var().IsType<framework::SelectedRows>()),
-      true, platform::errors::InvalidArgument(
-                "Variable is not initialized or Variable's type is not "
-                "LoDTensor or SelectedRows when getting numpy tensor"));
-
-  if (Var().IsType<framework::LoDTensor>()) {
-    auto& src_tensor = Var().Get<framework::LoDTensor>();
-    // TODO(Jiabin): change this after move unique_name generator to CXX
-    auto new_var = std::make_shared<VarBase>(
-        true, Name() + std::to_string(copied_counter_++));
-
-    new_var->SetPersistable(Persistable());
-    new_var->SetDataType(data_type);
-    new_var->SetType(Type());
-    auto* dst_tensor =
-        new_var->MutableVar()->GetMutable<framework::LoDTensor>();
-    dst_tensor->set_lod(src_tensor.lod());
-    framework::TensorCopy(src_tensor, dst_place, dst_tensor);
-    if (blocking) {
-      platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
-      auto src_place = src_tensor.place();
-      if (!(src_place == dst_place)) {
-        platform::DeviceContextPool::Instance().Get(src_place)->Wait();
-      }
-    }
-    VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
-            << dst_place;
-    VLOG(4) << "copy tensor " << Name() << " from " << DataType() << " to "
-            << data_type;
-
-    return new_var;
-  } else {
-    auto& src_selected_rows = Var().Get<framework::SelectedRows>();
-    auto new_var = std::make_shared<VarBase>(
-        false, "Itmp" + std::to_string(copied_counter_++));
-    new_var->SetType(framework::proto::VarType::SELECTED_ROWS);
-    new_var->SetDataType(data_type);
-    auto* dst_selected_rows =
-        new_var->MutableVar()->GetMutable<framework::SelectedRows>();
-
-    framework::TensorCopy(src_selected_rows.value(), dst_place,
-                          dst_selected_rows->mutable_value());
-    if (blocking) {
-      platform::DeviceContextPool::Instance().Get(dst_place)->Wait();
-      auto src_place = src_selected_rows.place();
-      if (!(src_place == dst_place)) {
-        platform::DeviceContextPool::Instance().Get(src_place)->Wait();
-      }
-    }
-    dst_selected_rows->set_height(src_selected_rows.height());
-    dst_selected_rows->set_rows(src_selected_rows.rows());
-    VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
-            << dst_place;
-    VLOG(4) << "copy tensor " << Name() << " from " << DataType() << " to "
-            << data_type;
-    return new_var;
+// NOTE(weilong wu): This func try to share grad_var_ data with target varbase
+void VarBase::_ShareDataWith(const VarBase& src) {
+  if (Var().IsInitialized()) {
+    PADDLE_ENFORCE_EQ(DataType(), src.DataType(),
+                      platform::errors::PreconditionNotMet(
+                          "Tensor %s has different data type with Tensor %s",
+                          Name(), src.Name()));
+    PADDLE_ENFORCE_EQ(Type(), src.Type(),
+                      platform::errors::PreconditionNotMet(
+                          "Tensor %s has different type with Tensor %s, Tensor "
+                          "ShareGradientDataWith cannot be performed!",
+                          Name(), src.Name()));
+  }
+  VLOG(4) << " VarBase ShareDataWith " << src.Name();
+  if (grad_var_) {
+    auto& src_tensor = src.Var().Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(src_tensor.IsInitialized(), true,
+                      platform::errors::InvalidArgument(
+                          "tensor %s has not been initialized", src.Name()));
+    auto* grad_t = grad_var_->MutableVar()->GetMutable<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(grad_t->IsInitialized(), true,
+                      platform::errors::InvalidArgument(
+                          "tensor %s has not been initialized", Name()));
+    grad_t->ShareDataWith(src_tensor);
   }
 }
 
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 3443a61e659d5d..5834e3f76d71d3 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -230,6 +230,8 @@ class VarBase {
 
   void BumpInplaceVersion();
 
+  void _ShareDataWith(const imperative::VarBase& src);
+
   /* Hook related method: now only used for GradVarBase */
   bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); }
 
@@ -252,10 +254,6 @@ class VarBase {
         std::forward<std::shared_ptr<std::function<void()>>>(hook));
   }
 
-  std::shared_ptr<VarBase> To(const platform::Place& dst_place,
-                              framework::proto::VarType::Type data_type,
-                              const bool blocking) const;
-
  private:
   /**
    * NOTE(zengjinle): never remove the const qualifier of `var_` if you are
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index f8a08fde47f269..05cfa127a27587 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -282,6 +282,27 @@ static void InitVarBaseFromTensorWithArgDefault(
   }
 }
 
+template <typename P>
+static void InitVarBaseFromTensorWithArg(imperative::VarBase *self,
+                                         const framework::Tensor &tensor,
+                                         const P &place) {
+  VLOG(4) << "Init VarBase";
+  new (self) imperative::VarBase(
+      imperative::GetCurrentTracer()->GenerateUniqueName("generated_tensor"));
+  self->SetPersistable(false);
+  self->SetType(framework::proto::VarType::LOD_TENSOR);
+  self->SetDataType(tensor.type());
+  auto *new_tensor = self->MutableVar()->GetMutable<framework::LoDTensor>();
+  // Same place: share the data directly; otherwise copy it to `place`.
+  if (platform::is_same_place(place, tensor.place())) {
+    new_tensor->ShareDataWith(tensor);
+    VLOG(4) << "Same place, do ShareDataWith";
+  } else {
+    framework::TensorCopy(tensor, place, new_tensor);
+    VLOG(4) << "Different place, do TensorCopy";
+  }
+}
+
 static std::string GetTypeName(const imperative::VarBase &var) {
   if (var.Type() == framework::proto::VarType::RAW) {
     return "RAW";
@@ -899,6 +920,16 @@ void BindImperative(py::module *m_ptr) {
            py::arg("stop_gradient") = -1)
       .def("__init__", &InitVarBaseFromNumpyWithArgDefault, py::arg("value"))
       .def("__init__", &InitVarBaseFromTensorWithArgDefault, py::arg("tensor"))
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::CPUPlace>,
+           py::arg("tensor"), py::arg("place"))
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::XPUPlace>,
+           py::arg("tensor"), py::arg("place"))
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::CUDAPlace>,
+           py::arg("tensor"), py::arg("place"))
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::CUDAPinnedPlace>,
+           py::arg("tensor"), py::arg("place"))
+      .def("__init__", &InitVarBaseFromTensorWithArg<platform::NPUPlace>,
+           py::arg("tensor"), py::arg("place"))
       .def("__init__", &InitVarBaseFromNumpyWithKwargs)
       .def(
           "__setitem_varbase__",
@@ -1909,7 +1940,7 @@ void BindImperative(py::module *m_ptr) {
                                    "tensor has not been initialized"));
              return t->IsSharedBufferWith(*t_t);
            })
-      .def("_Slice",
+      .def("_slice",
            [](const std::shared_ptr<imperative::VarBase> &self,
               int64_t begin_idx, int64_t end_idx) {
              auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
@@ -1918,28 +1949,17 @@ void BindImperative(py::module *m_ptr) {
                                    "tensor has not been initialized"));
              return t->Slice(begin_idx, end_idx);
            })
-      .def("_To",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::Place &place,
-              framework::proto::VarType::Type data_type, bool blocking) {
-             auto new_var = self->To(place, data_type, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
-      .def("_To",
-           [](const std::shared_ptr<imperative::VarBase> &self,
-              const platform::CPUPlace &place,
-              framework::proto::VarType::Type data_type, bool blocking) {
-             auto new_var = self->To(place, data_type, blocking);
-             if (!blocking) {
-               IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
-             }
-             return new_var;
-           },
-           py::return_value_policy::copy)
+      .def("_share_data_with",
+           [](std::shared_ptr<imperative::VarBase> &self,
+              const imperative::VarBase &src) { self->_ShareDataWith(src); })
+      .def("_numel",
+           [](std::shared_ptr<imperative::VarBase> &self) {
+             auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
+             PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
+                               platform::errors::InvalidArgument(
+                                   "tensor has not been initialized"));
+             return t->numel();
+           })
       .def_property("name", &imperative::VarBase::Name,
                     &imperative::VarBase::SetName)
       .def_property("stop_gradient",
diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index e2fd36448ba654..e4d579e15e98b3 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -357,6 +357,49 @@ def double_hook_fn(grad):
         helper = TensorHookRemoveHelper(self, hook_id)
         return helper
 
+    @framework.dygraph_only
+    def _to(self, device=None, dtype=None, blocking=None):
+
+        if device is None and dtype is None and blocking is None:
+            return self
+
+        if device is not None:
+            if isinstance(device, str):
+                device = paddle.device._convert_to_place(device)
+            elif isinstance(device, (core.CPUPlace, core.CUDAPlace,
+                                     core.CUDAPinnedPlace, core.XPUPlace)):
+                pass
+            else:
+                raise ValueError(
+                    "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is "
+                    + type(device).__name__)
+
+        if blocking is None:
+            blocking = True
+        else:
+            assert isinstance(
+                blocking,
+                bool), "blocking value error, must be the True, False or None"
+
+        def transform(t, device, dtype, blocking):
+            if device is None:
+                device = t.place
+            if dtype is None:
+                dtype = t.dtype
+
+            new_t = t._copy_to(device, blocking)
+
+            if dtype is not None and dtype != t.dtype:
+                new_t = new_t.cast(dtype=dtype)
+
+            return new_t
+
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", category=UserWarning)
+            return transform(self, device, dtype, blocking)
+
+        return self
+
     @property
     def grad(self):
         """
@@ -650,7 +693,7 @@ def is_combine_index(item):
         ("__deepcopy__", __deepcopy__), ("__module__", "paddle"),
         ("__name__", "Tensor"), ("__array__", __array__),
         ("__getitem__", __getitem__), ("item", item),
-        ("__setitem__", __setitem__)):
+        ("__setitem__", __setitem__), ("_to", _to)):
         setattr(core.VarBase, method_name, method)
 
     # NOTE(zhiqiu): pybind11 will set a default __str__ method of enum class.
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index cfaef15c1d335a..c909aa9bb48e61 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1154,5 +1154,105 @@ def test_bump_inplace_version(self):
         self.assertEqual(var.inplace_version, 2)
 
 
+class TestVarBaseSlice(unittest.TestCase):
+    def test_slice(self):
+        paddle.disable_static()
+        np_x = np.random.random((3, 8, 8))
+        x = paddle.to_tensor(np_x, dtype="float64")
+        actual_x = x._slice(0, 1)
+        actual_x = paddle.to_tensor(actual_x)
+        self.assertTrue(np.array_equal(actual_x.numpy(), np_x[0:1]))
+
+
+class TestVarBaseClear(unittest.TestCase):
+    def test_clear(self):
+        paddle.disable_static()
+        np_x = np.random.random((3, 8, 8))
+        x = paddle.to_tensor(np_x, dtype="float64")
+        x._clear()
+        self.assertEqual(str(x), "Tensor(Not initialized)")
+
+
+class TestVarBaseOffset(unittest.TestCase):
+    def test_offset(self):
+        paddle.disable_static()
+        np_x = np.random.random((3, 8, 8))
+        x = paddle.to_tensor(np_x, dtype="float64")
+        expected_offset = 0
+        actual_x = x._slice(expected_offset, 1)
+        actual_x = paddle.to_tensor(actual_x)
+        self.assertEqual(actual_x._offset(), expected_offset)
+
+
+class TestVarBaseShareBufferWith(unittest.TestCase):
+    def test_share_buffer_with(self):
+        paddle.disable_static()
+        np_x = np.random.random((3, 8, 8))
+        np_y = np.random.random((3, 8, 8))
+        x = paddle.to_tensor(np_x, dtype="float64")
+        y = paddle.to_tensor(np_y, dtype="float64")
+        x._share_buffer_with(y)
+        self.assertEqual(x._is_shared_buffer_with(y), True)
+
+
+class TestVarBaseTo(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        self.np_x = np.random.random((3, 8, 8))
+        self.x = paddle.to_tensor(self.np_x, dtype="float64")
+
+    def test_to_api(self):
+        x_double = self.x._to(dtype='double')
+        self.assertEqual(x_double.dtype, paddle.fluid.core.VarDesc.VarType.FP64)
+        self.assertTrue(np.allclose(self.np_x, x_double))
+
+        x_ = self.x._to()
+        self.assertEqual(self.x.dtype, paddle.fluid.core.VarDesc.VarType.FP64)
+        self.assertTrue(np.allclose(self.np_x, x_))
+
+        if paddle.fluid.is_compiled_with_cuda():
+            x_gpu = self.x._to(device=paddle.CUDAPlace(0))
+            self.assertTrue(x_gpu.place.is_gpu_place())
+            self.assertEqual(x_gpu.place.gpu_device_id(), 0)
+
+            x_gpu0 = self.x._to(device='gpu:0')
+            self.assertTrue(x_gpu0.place.is_gpu_place())
+            self.assertEqual(x_gpu0.place.gpu_device_id(), 0)
+
+        x_cpu = self.x._to(device=paddle.CPUPlace())
+        self.assertTrue(x_cpu.place.is_cpu_place())
+
+        x_cpu0 = self.x._to(device='cpu')
+        self.assertTrue(x_cpu0.place.is_cpu_place())
+
+
+class TestVarBaseInitVarBaseFromTensorWithDevice(unittest.TestCase):
+    def test_varbase_init(self):
+        paddle.disable_static()
+        t = fluid.Tensor()
+        np_x = np.random.random((3, 8, 8))
+        t.set(np_x, fluid.CPUPlace())
+
+        if paddle.fluid.is_compiled_with_cuda():
+            device = paddle.CUDAPlace(0)
+            tmp = fluid.core.VarBase(t, device)
+            self.assertTrue(tmp.place.is_gpu_place())
+            self.assertTrue(np.array_equal(tmp.numpy(), np_x))
+
+        device = paddle.CPUPlace()
+        tmp = fluid.core.VarBase(t, device)
+        self.assertTrue(np.array_equal(tmp.numpy(), np_x))
+
+
+class TestVarBaseNumel(unittest.TestCase):
+    def test_numel(self):
+        paddle.disable_static()
+        np_x = np.random.random((3, 8, 8))
+        x = paddle.to_tensor(np_x, dtype="float64")
+        x_actual_numel = x._numel()
+        x_expected_numel = np.prod((3, 8, 8))
+        self.assertEqual(x_actual_numel, x_expected_numel)
+
+
 if __name__ == '__main__':
     unittest.main()

From 8e1312a27649fb41690596a308bfcd19ab5f8c6b Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Wed, 3 Nov 2021 14:22:56 +0000
Subject: [PATCH 3/7] Change func name and add test case for _CopyGradientWith

---
 paddle/fluid/imperative/layer.cc                    | 10 +++++++---
 paddle/fluid/imperative/layer.h                     |  2 +-
 paddle/fluid/pybind/imperative.cc                   |  4 ++--
 .../paddle/fluid/tests/unittests/test_var_base.py   | 13 +++++++++++++
 4 files changed, 23 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index d24af5dfef826c..27c9c659fe6fb5 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -356,8 +356,10 @@ void VarBase::BumpInplaceVersion() {
   MutableVar()->BumpInplaceVersion();
 }
 
-// NOTE(weilong wu): This func try to share grad_var_ data with target varbase
-void VarBase::_ShareDataWith(const VarBase& src) {
+// NOTE(weilong wu):
+// This function try to copy the data from target varbase,
+// and fill into the grad_var_ of the current varbase.
+void VarBase::_CopyGradientWith(const VarBase& src) {
   if (Var().IsInitialized()) {
     PADDLE_ENFORCE_EQ(DataType(), src.DataType(),
                       platform::errors::PreconditionNotMet(
@@ -369,7 +371,7 @@ void VarBase::_ShareDataWith(const VarBase& src) {
                           "ShareGradientDataWith cannot be performed!",
                           Name(), src.Name()));
   }
-  VLOG(4) << " VarBase ShareDataWith " << src.Name();
+  VLOG(4) << " VarBase copy gradient with " << src.Name();
   if (grad_var_) {
     auto& src_tensor = src.Var().Get<framework::LoDTensor>();
     PADDLE_ENFORCE_EQ(src_tensor.IsInitialized(), true,
@@ -379,7 +381,9 @@ void VarBase::_ShareDataWith(const VarBase& src) {
     PADDLE_ENFORCE_EQ(grad_t->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "tensor %s has not been initialized", Name()));
+    auto* var_ = MutableVar()->GetMutable<framework::LoDTensor>();
     grad_t->ShareDataWith(src_tensor);
+    grad_t->Resize(var_->dims());
   }
 }
 
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 5834e3f76d71d3..650998a356e329 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -230,7 +230,7 @@ class VarBase {
 
   void BumpInplaceVersion();
 
-  void _ShareDataWith(const imperative::VarBase& src);
+  void _CopyGradientWith(const imperative::VarBase& src);
 
   /* Hook related method: now only used for GradVarBase */
   bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); }
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 05cfa127a27587..1f60da627cdc23 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1949,9 +1949,9 @@ void BindImperative(py::module *m_ptr) {
                                    "tensor has not been initialized"));
              return t->Slice(begin_idx, end_idx);
            })
-      .def("_share_data_with",
+      .def("_copy_gradient_with",
            [](std::shared_ptr<imperative::VarBase> &self,
-              const imperative::VarBase &src) { self->_ShareDataWith(src); })
+              const imperative::VarBase &src) { self->_CopyGradientWith(src); })
       .def("_numel",
            [](std::shared_ptr<imperative::VarBase> &self) {
              auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index c909aa9bb48e61..31b6c6c5b83bd4 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1254,5 +1254,18 @@ def test_numel(self):
         self.assertEqual(x_actual_numel, x_expected_numel)
 
 
+class TestVarBaseCopyGradientWith(unittest.TestCase):
+    def test_copy_gradient_with(self):
+        paddle.disable_static()
+        np_x = np.random.random((2, 2))
+        np_y = np.random.random((2, 2))
+        x = paddle.to_tensor(np_x, dtype="float64", stop_gradient=False)
+        y = paddle.to_tensor(np_y, dtype="float64")
+        out = x + x
+        out.backward()
+        x._copy_gradient_with(y)
+        self.assertEqual(x.grad.numpy().all(), np_y.all())
+
+
 if __name__ == '__main__':
     unittest.main()

From ed8980d8c1d2887869a79f92dd1753744886a09e Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Wed, 3 Nov 2021 18:42:43 +0000
Subject: [PATCH 4/7] Rename func

---
 paddle/fluid/imperative/layer.cc                     | 2 +-
 paddle/fluid/imperative/layer.h                      | 2 +-
 paddle/fluid/pybind/imperative.cc                    | 4 ++--
 python/paddle/fluid/tests/unittests/test_var_base.py | 6 +++---
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 27c9c659fe6fb5..f2469e613b0d6c 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -359,7 +359,7 @@ void VarBase::BumpInplaceVersion() {
 // NOTE(weilong wu):
 // This function try to copy the data from target varbase,
 // and fill into the grad_var_ of the current varbase.
-void VarBase::_CopyGradientWith(const VarBase& src) {
+void VarBase::_CopyGradientFrom(const VarBase& src) {
   if (Var().IsInitialized()) {
     PADDLE_ENFORCE_EQ(DataType(), src.DataType(),
                       platform::errors::PreconditionNotMet(
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 650998a356e329..f66f72a48fba9e 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -230,7 +230,7 @@ class VarBase {
 
   void BumpInplaceVersion();
 
-  void _CopyGradientWith(const imperative::VarBase& src);
+  void _CopyGradientFrom(const imperative::VarBase& src);
 
   /* Hook related method: now only used for GradVarBase */
   bool HasVariableWrapperHook() const { return var_->HasVariableWrapperHook(); }
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 1f60da627cdc23..ced5d0390a02e3 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1949,9 +1949,9 @@ void BindImperative(py::module *m_ptr) {
                                    "tensor has not been initialized"));
              return t->Slice(begin_idx, end_idx);
            })
-      .def("_copy_gradient_with",
+      .def("_copy_gradient_from",
            [](std::shared_ptr<imperative::VarBase> &self,
-              const imperative::VarBase &src) { self->_CopyGradientWith(src); })
+              const imperative::VarBase &src) { self->_CopyGradientFrom(src); })
       .def("_numel",
            [](std::shared_ptr<imperative::VarBase> &self) {
              auto *t = self->MutableVar()->GetMutable<framework::LoDTensor>();
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 31b6c6c5b83bd4..de02d08fdda6d4 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1254,8 +1254,8 @@ def test_numel(self):
         self.assertEqual(x_actual_numel, x_expected_numel)
 
 
-class TestVarBaseCopyGradientWith(unittest.TestCase):
-    def test_copy_gradient_with(self):
+class TestVarBaseCopyGradientFrom(unittest.TestCase):
+    def test_copy_gradient_from(self):
         paddle.disable_static()
         np_x = np.random.random((2, 2))
         np_y = np.random.random((2, 2))
@@ -1263,7 +1263,7 @@ def test_copy_gradient_with(self):
         y = paddle.to_tensor(np_y, dtype="float64")
         out = x + x
         out.backward()
-        x._copy_gradient_with(y)
+        x._copy_gradient_from(y)
         self.assertEqual(x.grad.numpy().all(), np_y.all())
 
 

From c54c549d67986bd428e2db2fb1d36ec6cbd8799a Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Thu, 4 Nov 2021 13:51:17 +0000
Subject: [PATCH 5/7] Add test cases to increase coverage

---
 python/paddle/fluid/dygraph/varbase_patch_methods.py | 2 --
 python/paddle/fluid/tests/unittests/test_var_base.py | 3 +++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index e4d579e15e98b3..6f155360e4e165 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -398,8 +398,6 @@ def transform(t, device, dtype, blocking):
             warnings.filterwarnings("ignore", category=UserWarning)
             return transform(self, device, dtype, blocking)
 
-        return self
-
     @property
     def grad(self):
         """
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index de02d08fdda6d4..5a855bfc5ff1dd 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1225,6 +1225,9 @@ def test_to_api(self):
         x_cpu0 = self.x._to(device='cpu')
         self.assertTrue(x_cpu0.place.is_cpu_place())
 
+        self.assertRaises(ValueError, self.x._to, device=1)
+        self.assertRaises(AssertionError, self.x._to, blocking=1)
+
 
 class TestVarBaseInitVarBaseFromTensorWithDevice(unittest.TestCase):
     def test_varbase_init(self):

From fe60c02e78927c3fdd2b8d1e120827a6df3c695e Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Tue, 9 Nov 2021 08:39:39 +0000
Subject: [PATCH 6/7] Refine the logic of _to func

---
 .../fluid/dygraph/varbase_patch_methods.py    | 42 +++++++++++++++++--
 1 file changed, 38 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 6f155360e4e165..75df8e7f29d31b 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -387,12 +387,46 @@ def transform(t, device, dtype, blocking):
             if dtype is None:
                 dtype = t.dtype
 
-            new_t = t._copy_to(device, blocking)
+            # 1. For a GPU place, check whether enough memory is available for the allocation.
+            if t.place.is_gpu_place():
+                gpu_memory_available = core.gpu_memory_available()
+                # for gpu, minimum memory allocation unit is 256 bytes.
+                if type(dtype) is str:
+                    size_dtype = core.size_of_dtype(
+                        framework.convert_np_dtype_to_dtype_(dtype))
+                else:
+                    size_dtype = core.size_of_dtype(dtype)
+                # Note(weilong wu): Paddle GPU minimum memory allocation unit is 256 bytes,
+                # waiting_alloc_memory will compute the memory space occupied by 't'.
+                # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
+                waiting_alloc_memory = (
+                    (t.numel().numpy()[0] * size_dtype) / 256 + 1) * 256 * 1.2
+                if gpu_memory_available < waiting_alloc_memory:
+                    # Copy Tensor to cpu
+                    t_used = t._copy_to(paddle.CPUPlace(), blocking)
+                    # Release memory of t
+                    t.value().get_tensor()._clear()
+                else:
+                    # Tensor still in GPU
+                    t_used = t
+            else:
+                t_used = t
+
+            # 2. cast Tensor to dtype
+            if dtype is not None and dtype != t_used.dtype:
+                t_casted = t_used.cast(dtype=dtype)
+            else:
+                t_casted = t_used
+
+            # 3. Copy the cast Tensor (on CPU or GPU) to the target device
+            new_t = t_casted._copy_to(device, blocking)
 
-            if dtype is not None and dtype != t.dtype:
-                new_t = new_t.cast(dtype=dtype)
+            # 4. Share the new Tensor's data with the original Tensor `t`, which is then returned
+            dst_tensor = t.value().get_tensor()
+            src_tensor = new_t.value().get_tensor()
+            dst_tensor._share_data_with(src_tensor)
 
-            return new_t
+            return t
 
         with warnings.catch_warnings():
             warnings.filterwarnings("ignore", category=UserWarning)

From c6a16b313ae4d28e2c2348995c14918961ff346f Mon Sep 17 00:00:00 2001
From: veyron95 <veyron_wu@163.com>
Date: Tue, 9 Nov 2021 14:22:51 +0000
Subject: [PATCH 7/7] Replace numel() with _numel(), Add test code

---
 .../fluid/dygraph/varbase_patch_methods.py    |  4 ++--
 .../fluid/tests/unittests/test_var_base.py    | 22 ++++++++++++++++++-
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py
index 75df8e7f29d31b..32a4b5145effe3 100644
--- a/python/paddle/fluid/dygraph/varbase_patch_methods.py
+++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py
@@ -400,12 +400,12 @@ def transform(t, device, dtype, blocking):
                 # waiting_alloc_memory will compute the memory space occupied by 't'.
                 # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough.
                 waiting_alloc_memory = (
-                    (t.numel().numpy()[0] * size_dtype) / 256 + 1) * 256 * 1.2
+                    (t._numel() * size_dtype) / 256 + 1) * 256 * 1.2
                 if gpu_memory_available < waiting_alloc_memory:
                     # Copy Tensor to cpu
                     t_used = t._copy_to(paddle.CPUPlace(), blocking)
                     # Release memory of t
-                    t.value().get_tensor()._clear()
+                    t._clear()
                 else:
                     # Tensor still in GPU
                     t_used = t
diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py
index 5a855bfc5ff1dd..95f7c0aca788aa 100644
--- a/python/paddle/fluid/tests/unittests/test_var_base.py
+++ b/python/paddle/fluid/tests/unittests/test_var_base.py
@@ -1199,7 +1199,7 @@ class TestVarBaseTo(unittest.TestCase):
     def setUp(self):
         paddle.disable_static()
         self.np_x = np.random.random((3, 8, 8))
-        self.x = paddle.to_tensor(self.np_x, dtype="float64")
+        self.x = paddle.to_tensor(self.np_x, dtype="float32")
 
     def test_to_api(self):
         x_double = self.x._to(dtype='double')
@@ -1219,12 +1219,32 @@ def test_to_api(self):
             self.assertTrue(x_gpu0.place.is_gpu_place())
             self.assertEqual(x_gpu0.place.gpu_device_id(), 0)
 
+            x_gpu1 = self.x._to(device='gpu:0', dtype="float64")
+            self.assertTrue(x_gpu1.place.is_gpu_place())
+            self.assertEqual(x_gpu1.place.gpu_device_id(), 0)
+            self.assertEqual(x_gpu1.dtype,
+                             paddle.fluid.core.VarDesc.VarType.FP64)
+
+            x_gpu2 = self.x._to(device='gpu:0', dtype="float16")
+            self.assertTrue(x_gpu2.place.is_gpu_place())
+            self.assertEqual(x_gpu2.place.gpu_device_id(), 0)
+            self.assertEqual(x_gpu2.dtype,
+                             paddle.fluid.core.VarDesc.VarType.FP16)
+
         x_cpu = self.x._to(device=paddle.CPUPlace())
         self.assertTrue(x_cpu.place.is_cpu_place())
 
         x_cpu0 = self.x._to(device='cpu')
         self.assertTrue(x_cpu0.place.is_cpu_place())
 
+        x_cpu1 = self.x._to(device=paddle.CPUPlace(), dtype="float64")
+        self.assertTrue(x_cpu1.place.is_cpu_place())
+        self.assertEqual(x_cpu1.dtype, paddle.fluid.core.VarDesc.VarType.FP64)
+
+        x_cpu2 = self.x._to(device='cpu', dtype="float16")
+        self.assertTrue(x_cpu2.place.is_cpu_place())
+        self.assertEqual(x_cpu2.dtype, paddle.fluid.core.VarDesc.VarType.FP16)
+
         self.assertRaises(ValueError, self.x._to, device=1)
         self.assertRaises(AssertionError, self.x._to, blocking=1)