From 8e89dd36f1c9f11ad7ca66cd384f7d46d66fdae9 Mon Sep 17 00:00:00 2001 From: Lu Qi <61354321+MarioLulab@users.noreply.github.com> Date: Fri, 22 Sep 2023 10:21:19 +0800 Subject: [PATCH] [GLCC]Part-3: Support jit.save and jit.load for pylayer op (#57066) * complete static_pylayer op * finish static_pylayer op context manager * finish single test * append import path * maybe modify test/ir/inference * percept static_pylayer op in dy2st --- paddle/fluid/framework/prune.cc | 181 +++++++-- python/paddle/jit/dy2static/py_layer.py | 15 +- python/paddle/static/io.py | 14 + python/paddle/static/nn/static_pylayer.py | 35 +- test/dygraph_to_static/test_pylayer.py | 344 ++++++++++++++++- test/legacy_test/test_jit_save_load.py | 4 +- .../test_program_prune_backward.py | 89 +++++ test/legacy_test/test_prune.py | 365 ++++++++++-------- test/legacy_test/test_static_pylayer.py | 252 +++++++++++- 9 files changed, 1078 insertions(+), 221 deletions(-) diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index d4c2021d5f6e16..93467b549d6e9a 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -30,6 +30,8 @@ const char kRecurrent[] = "recurrent"; // NOLINT const char kStates[] = "states"; // NOLINT const char kExStates[] = "ex_states"; // NOLINT +const char kPyLayer[] = "pylayer"; // NOLINT + bool HasDependentInputVar( const proto::OpDesc& op_desc, const std::unordered_set& dependent_vars) { @@ -86,6 +88,23 @@ int GetSubBlockIndex(const proto::OpDesc& op_desc) { return -1; } +void GetSubBlocksIndices(const proto::OpDesc& op_desc, + std::vector* indices) { + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCKS) { + PADDLE_ENFORCE_GT( + attr.blocks_idx_size(), + 0, + platform::errors::NotFound( + "Attribute blocks is not found in operator %s", op_desc.type())); + indices->resize(attr.blocks_idx_size()); + for (int i = 0; i < attr.blocks_idx_size(); i++) { + (*indices)[i] = attr.blocks_idx(i); + } + } + } +} + void SetSubBlockIndex(proto::OpDesc* op_desc, int sub_idx) { for (auto& attr : *op_desc->mutable_attrs()) { if (attr.type() == proto::AttrType::BLOCK) { @@ -99,10 +118,43 @@ void SetSubBlockIndex(proto::OpDesc* op_desc, int sub_idx) { } } +void SetSubBlocksIndices(proto::OpDesc* op_desc, + const std::vector& sub_indices) { + for (auto& attr : *op_desc->mutable_attrs()) { + if (attr.type() == proto::AttrType::BLOCKS) { + PADDLE_ENFORCE_GT( + attr.blocks_idx_size(), + 0, + platform::errors::NotFound( + "Attribute blocks is not found in operator %s", op_desc->type())); + attr.clear_blocks_idx(); + for (auto idx : sub_indices) { + attr.add_blocks_idx(idx); + } + } + } +} + bool HasSubBlock(const proto::OpDesc& op_desc) { return GetSubBlockIndex(op_desc) > 0; } +bool HasSubBlocks(const proto::OpDesc& op_desc) { + // ``blocks_idx_size() == 0`` indicates no sub blocks. + for (auto& attr : op_desc.attrs()) { + if (attr.type() == proto::AttrType::BLOCKS) { + PADDLE_ENFORCE_GT( + attr.blocks_idx_size(), + 0, + platform::errors::NotFound( + "Attribute blocks is not found in operator %s", op_desc.type())); + return true; + } + } + + return false; +} + int GetOpRole(const proto::OpDesc& op_desc) { for (auto& attr : op_desc.attrs()) { if (attr.name() == OpProtoAndCheckerMaker::OpRoleAttrName()) { @@ -150,14 +202,15 @@ int FindMapByValue(const std::map& m, int val) { } // In other two cases, the op that has feed vars as output vars is dependent: -// 1. op has subblock, like while/for/ifelse/recurrent +// 1. 
op has subblock, like while/for/ifelse/recurrent/pylayer // 2. op is in subblock bool IsSubBlockDependent(const proto::OpDesc& op_desc, const std::set& feed_vars, int parent_block_id) { for (auto& var : op_desc.outputs()) { for (auto& argu : var.arguments()) { - if ((HasSubBlock(op_desc) || parent_block_id != -1) && + if ((HasSubBlock(op_desc) || HasSubBlocks(op_desc) || + parent_block_id != -1) && feed_vars.count(argu) != 0) { return true; } @@ -289,7 +342,7 @@ void prune_impl(const proto::ProgramDesc& input, if (should_run[i]) { auto* op = op_field->Add(); *op = input.blocks(block_id).ops(static_cast(i)); - if (HasSubBlock(*op)) { + if (HasSubBlock(*op) || HasSubBlocks(*op)) { VLOG(2) << "Pruning op which has sub block: " << op->type(); // create sub_block_dependent_vars here to help prune the sub block std::unordered_set sub_block_dependent_vars; @@ -321,15 +374,41 @@ void prune_impl(const proto::ProgramDesc& input, } } } - // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc - // output_block_id is the idx of the current block in the output desc - prune_impl(input, - output, - GetSubBlockIndex(*op), - output_block_id, - &sub_block_dependent_vars, - feed_var_names, - pruned_origin_block_id_map); + if (HasSubBlock(*op)) { + // GetSubBlockIndex(*op) is the idx of the sub_block in the input desc + // output_block_id is the idx of the current block in the output desc + prune_impl(input, + output, + GetSubBlockIndex(*op), + output_block_id, + &sub_block_dependent_vars, + feed_var_names, + pruned_origin_block_id_map); + } else if (HasSubBlocks(*op)) { + // GetSubBlocksIndices(*op) are the indices of the sub_blocks in the + // input desc output_block_id is the idx of the current block in the + // output desc + std::vector sub_indices; + GetSubBlocksIndices(*op, &sub_indices); + for (auto& sub_index : sub_indices) { + // create a copy of dependent_vars to avoid being overwrited by the + // other sub_block + std::unordered_set dependent_vars_copy = + sub_block_dependent_vars; + prune_impl(input, + output, + sub_index, + output_block_id, + &dependent_vars_copy, + feed_var_names, + pruned_origin_block_id_map); + } + } else { + PADDLE_ENFORCE(false, + platform::errors::PreconditionNotMet( + "Attr Block or Blocks must exist when recursively " + "calling prune_impl")); + } } } } @@ -402,12 +481,29 @@ std::map Prune(const proto::ProgramDesc& input, int origin_sub_idx = GetSubBlockIndex(op_desc); auto sub_idx = FindMapByValue(pruned_origin_block_id_map, origin_sub_idx); - PADDLE_ENFORCE_NE(sub_idx, - -1, - platform::errors::NotFound( - "The origin sub block id should be found in " - "pruned_progin_block_id_map")); + PADDLE_ENFORCE_NE( + sub_idx, + -1, + platform::errors::NotFound( + "The origin sub block id should be found in " + "pruned_progin_block_id_map when the op has sub_block")); SetSubBlockIndex(&op_desc, sub_idx); + } else if (HasSubBlocks(op_desc)) { + std::vector origin_sub_indices; + GetSubBlocksIndices(op_desc, &origin_sub_indices); + std::vector sub_indices; + for (int index : origin_sub_indices) { + auto sub_idx = FindMapByValue(pruned_origin_block_id_map, index); + PADDLE_ENFORCE_NE( + sub_idx, + -1, + platform::errors::NotFound( + "The origin sub block id should be found in " + "pruned_progin_block_id_map when the op has sub_blocks")); + sub_indices.push_back(sub_idx); + } + + SetSubBlocksIndices(&op_desc, sub_indices); } } } @@ -441,6 +537,19 @@ void PruneBackwardImpl(proto::BlockDesc* origin, proto::BlockDesc* pruned) { AppendOpInputVarNames(op_desc, 
&op_input_vars); AppendOpOutputVarNames(op_desc, &op_output_vars); *op = op_desc; + + // if the type of op is "pylayer", we need to update the ``blocks`` + // attribute because the backward block will be pruned + if (op->type() == kPyLayer && HasSubBlocks(*op)) { + std::vector sub_indices; + GetSubBlocksIndices(*op, &sub_indices); + if (sub_indices.size() > 1) { + // sub_indices contains both forward block id and backward block id + std::vector new_sub_indices(sub_indices.begin(), + sub_indices.end() - 1); + SetSubBlocksIndices(op, new_sub_indices); + } + } } } @@ -471,9 +580,10 @@ std::tuple> PruneBackward( // Copy original ProgramDesc, origin can't be change framework::ProgramDesc origin_clone(origin); - // Step 1. check if the program contains grad loss operator. - // If not, the program need no pruning. + // Step 1. check if the program contains grad loss operator or pylayer + // operator. If not, the program need no pruning. bool has_loss_grad_op = false; + bool has_pylayer_op = false; std::queue block_contains_loss; std::queue block_contains_loss_grad; for (size_t i = 0; i < origin_clone.Size(); i++) { @@ -485,13 +595,15 @@ std::tuple> PruneBackward( static_cast(OpRole::kLoss))) { op->SetIsTarget(false); has_loss_grad_op = true; - break; + } + if (op->Type() == kPyLayer) { + has_pylayer_op = true; } } } std::map pruned_progin_block_id_map; - if (!has_loss_grad_op) { + if (!has_loss_grad_op && !has_pylayer_op) { // No pruning, fast return a copy of the origin ProgramDesc with an empty // map, means default mapped, i.e.{0:0, 1:1, ..., n:n}. return std::make_tuple(framework::ProgramDesc(origin_clone), @@ -544,12 +656,29 @@ std::tuple> PruneBackward( int origin_sub_idx = GetSubBlockIndex(op_desc); auto sub_idx = FindMapByValue(pruned_progin_block_id_map, origin_sub_idx); - PADDLE_ENFORCE_NE(sub_idx, - -1, - platform::errors::NotFound( - "The origin sub block id is not found in " - "pruned_progin_block_id_map")); + PADDLE_ENFORCE_NE( + sub_idx, + -1, + platform::errors::NotFound( + "The origin sub block id is not found in " + "pruned_progin_block_id_map when the op has sub_block")); SetSubBlockIndex(&op_desc, sub_idx); + } else if (HasSubBlocks(op_desc)) { + std::vector origin_sub_indices; + GetSubBlocksIndices(op_desc, &origin_sub_indices); + std::vector sub_indices; + for (int index : origin_sub_indices) { + auto sub_idx = FindMapByValue(pruned_progin_block_id_map, index); + PADDLE_ENFORCE_NE( + sub_idx, + -1, + platform::errors::NotFound( + "The origin sub block id should be found in " + "pruned_progin_block_id_map when the op has sub_blocks")); + sub_indices.push_back(sub_idx); + } + + SetSubBlocksIndices(&op_desc, sub_indices); } } } diff --git a/python/paddle/jit/dy2static/py_layer.py b/python/paddle/jit/dy2static/py_layer.py index 1d238e667c6535..b32397b0aa3ee1 100644 --- a/python/paddle/jit/dy2static/py_layer.py +++ b/python/paddle/jit/dy2static/py_layer.py @@ -13,6 +13,7 @@ # limitations under the License. 
import functools +import inspect from paddle.base.framework import Variable from paddle.common_ops_import import LayerHelper @@ -73,9 +74,19 @@ def __init__(self, dyfunc_self): ) # NOTE: only support position args and Variables Now - def apply(self, *args): + def apply(self, *args, **kwargs): + # rearrange `position-args + keyword-args` into `position-args` + dyfunc_sig = inspect.signature(self.dyfunc_self.forward) + bound_args = dyfunc_sig.bind(self.dyfunc_self, *args, **kwargs) + bound_args.apply_defaults() + input_args = [ + item + for i, item in enumerate(bound_args.arguments.values()) + if i > 0 + ] # index 0 indicate `dyfunc_self` which shouldn't be put into `input_args` + return static_pylayer( forward_fn=self.forward_fn_with_ctx, - inputs=list(args), + inputs=input_args, backward_fn=self.backward_fn_with_ctx, ) diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index 8f68f3f9e89bfd..943e8525ba466d 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -274,6 +274,20 @@ def normalize_program(program, feed_vars, fetch_vars, **kwargs): op.desc.set_is_target(False) if op.type == "feed" or op.type == "fetch": remove_op_idx.append(i) + + if op.type == "pylayer": + sub_blocks_ids = op._blocks_attr_ids("blocks") + if len(sub_blocks_ids) > 1: + # pylayer op ``blocks`` attr contains forward block id and backward block id + backward_block_id = sub_blocks_ids[-1] + # remove backward block + copy_program.blocks.pop(backward_block_id) + # update attrs ``blocks`` + reserverd_blocks = [] + for block_id in sub_blocks_ids[:-1]: + reserverd_blocks.append(copy_program.block(block_id)) + op._update_desc_attr("blocks", reserverd_blocks) + for idx in remove_op_idx[::-1]: global_block._remove_op(idx) copy_program.desc.flush() diff --git a/python/paddle/static/nn/static_pylayer.py b/python/paddle/static/nn/static_pylayer.py index 91d0f9d2351ffe..3dcf35e50e54b0 100644 --- a/python/paddle/static/nn/static_pylayer.py +++ b/python/paddle/static/nn/static_pylayer.py @@ -45,11 +45,13 @@ def __exit__(self, exc_type, exc_val, exc_tb): class StaticPyLayerBlock: def __init__(self, inputs, name=None, pylayer_context=None): - for each_input in inputs: - check_type(each_input, "input", Variable, "StaticPyLayerBlock") + # used to specify the Variable type `Input` to `pylayer` op + self.fwd_inputs = [ + each_input + for each_input in inputs + if isinstance(each_input, Variable) + ] # filter non-Variable inputs - # used to specify the `Input` to `pylayer` op - self.fwd_inputs = inputs # used to specify the `Out` to `pylayer` op self.fwd_outputs = [] @@ -105,7 +107,7 @@ def complete_backward_block(self): parent_block = self.helper.main_program.block(inside_block.parent_idx) self._backward_block_id = inside_block.idx - # set OpRole to `backward` + # Set OpRole to `backward`. The operators marked as `backward` are expected to be pruned in PruneBackward. 
for op in inside_block.ops: op_role_attr_name = ( core.op_proto_and_checker_maker.kOpRoleAttrName() @@ -234,8 +236,6 @@ def copy_var_from_parent_block(parent_block_var, layer_helper): return current_block_var -# TODO(MarioLulab): -# Need to support non-Variable in ``inputs`` def static_pylayer(forward_fn, inputs, backward_fn=None, name=None): """ This API returns ``forward_fn(inputs)``, and two sub-block are created based on @@ -344,7 +344,9 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None): origin_output = forward_fn(*inputs) if origin_output is not None: output = map_structure(copy_to_parent_func, origin_output) - mgr.fwd_outputs = flatten(output) + mgr.fwd_outputs = [ + x for x in flatten(output) if isinstance(x, Variable) + ] else: mgr.fwd_outputs = [] @@ -358,7 +360,7 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None): # **Create the backward input** from the output of the op to build the # backward block, and then delete it. grad_var_ins = [] - for fwd_var in flatten(output): + for fwd_var in pylayer_block_manager.fwd_outputs: fwd_var_name = fwd_var.name bwd_var_name = _append_grad_suffix_(fwd_var_name) if not current_block.desc.has_var_recursive(fwd_var_name.encode()): @@ -405,7 +407,7 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None): but got {len(forward_input_names)} and {len(flat_grad_origin)}" # Step4. Rename var name with suffix of "@GRAD" - for bwd_output_name, fwd_input_name in zip( + for bwd_output, fwd_input_name in zip( flat_grad_origin, forward_input_names ): # NOTE(MarioLulab): Because `flat_grad_origin` are the Variables inside the backward block, which one by one corresponds @@ -428,12 +430,13 @@ def static_pylayer(forward_fn, inputs, backward_fn=None, name=None): # TODO(MarioLulab): We will validate the assumption above is whether a strong hypothesis or not. # attach old var name into new - bwd_out_new = _append_grad_suffix_( - fwd_input_name - ) # "X" => "X@GRAD" - mgr.var_old_to_new[ - bwd_output_name.name - ] = bwd_out_new # e.g. "tmp_0.mean_0": "X@GRAD" + if isinstance(bwd_output, Variable): + bwd_out_new = _append_grad_suffix_( + fwd_input_name + ) # "X" => "X@GRAD" + mgr.var_old_to_new[ + bwd_output.name + ] = bwd_out_new # e.g. "tmp_0.mean_0": "X@GRAD" # **Delete the backward input** for bwd_var in grad_var_ins: diff --git a/test/dygraph_to_static/test_pylayer.py b/test/dygraph_to_static/test_pylayer.py index 88558e3d628fb4..ee2d1248e5f634 100644 --- a/test/dygraph_to_static/test_pylayer.py +++ b/test/dygraph_to_static/test_pylayer.py @@ -15,9 +15,12 @@ """Tests for PyLayer of Dynamic-to-Static. 
Only test simple cases here.""" +import os +import tempfile import unittest import numpy as np +from legacy_test.test_jit_save_load import train import paddle from paddle.autograd.py_layer import PyLayer @@ -51,7 +54,7 @@ def backward(ctx, dy): class scaled_layer_2(PyLayer): @staticmethod def forward(ctx, x1, x2): - y = x1 * x2 + y = 3 * x1 + x2 / 5 return y @staticmethod @@ -75,6 +78,78 @@ def backward(ctx, dy): return grad +class cus_tanh_2(PyLayer): + @staticmethod + def forward(ctx, x, func1, func2=paddle.square): + ctx.func = func2 + y = func1(x) + ctx.save_for_backward(y) + return y + + @staticmethod + def backward(ctx, dy): + (y,) = ctx.saved_tensor() + grad = dy * (1 - ctx.func(y)) + return grad + + +class cus_tanh_3(PyLayer): + @staticmethod + def forward(ctx, x1, x2, func1, func2=paddle.square): + ctx.func = func2 + y1 = func1(x1) + y2 = func1(x2) + ctx.save_for_backward(y1, y2) + return 1, None, y1, y2, '' + + @staticmethod + def backward(ctx, dy1, dy2): + y1, y2 = ctx.saved_tensor() + re1 = dy1 * (1 - ctx.func(y1)) + re2 = dy2 * (1 - paddle.square(y2)) + return re1, None + + +def user_defined_tanh(x): + y = paddle.tanh(x) + return y + + +def user_defined_square(x): + y = paddle.square(x) + return y + + +class cus_tanh_4(PyLayer): + @staticmethod + def forward(ctx, x, func, name="cus_tanh_4"): + ctx.func = func + y = user_defined_tanh(x) + ctx.save_for_backward(y) + return y + + @staticmethod + def backward(ctx, dy): + (y,) = ctx.saved_tensor() + grad = dy * (1 - ctx.func(y)) + return grad + + +class cus_sigmoid(PyLayer): + @staticmethod + def forward(ctx, x, func1, func2): + ctx.func = func2 + y = 1 / (1 + func1(-x)) + ctx.save_for_backward(x) + return y + + @staticmethod + def backward(ctx, dy): + (x,) = ctx.saved_tensor() + grad = dy * ctx.func(x) * (1 - ctx.func(x)) + return grad + + class nested_layer(PyLayer): @staticmethod def forward(ctx, x1, x2): @@ -92,9 +167,9 @@ def backward(ctx, dy): class SimpleNet_1(paddle.nn.Layer): - def __init__(self): + def __init__(self, in_size, out_size): super().__init__() - self.linear = paddle.nn.Linear(4, 8) + self.linear = paddle.nn.Linear(in_size, out_size) @paddle.jit.to_static def forward(self, data): @@ -103,6 +178,30 @@ def forward(self, data): return z +class SimpleNet_2(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super().__init__() + self.linear = paddle.nn.Linear(in_size, out_size) + + def forward(self, x): + y = self.linear(x) + out = cus_tanh_2.apply(y, func1=paddle.tanh) + return out + + +class SimpleNet_3(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super().__init__() + self.linear = paddle.nn.Linear(in_size, out_size) + + def forward(self, x): + y = self.linear(x) + out = cus_sigmoid.apply( + y, func1=paddle.exp, func2=paddle.nn.functional.sigmoid + ) + return out + + class SimpleNetInplace(paddle.nn.Layer): def __init__(self): super().__init__() @@ -115,6 +214,48 @@ def forward(self, data): return z +class SimplePyLayerNet(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super().__init__() + self.linear = paddle.nn.Linear(in_size, out_size) + + @paddle.jit.to_static + def forward(self, x): + y = self.linear(x) + out = cus_tanh_2.apply(y, func1=paddle.tanh) + out = paddle.mean(out) + return out + + +class SimplePyLayerNetMultiIn(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super().__init__() + self.linear1 = paddle.nn.Linear(in_size, out_size) + self.linear2 = paddle.nn.Linear(in_size, out_size) + + @paddle.jit.to_static + def forward(self, x1, x2): + y1 = 
self.linear1(x1) + y2 = self.linear1(x2) + out = cus_tanh_2.apply(y1, func1=paddle.tanh) + out = out + y2 + out = paddle.mean(out) + return out + + +class SimplePyLayerNetStopGrad(paddle.nn.Layer): + def __init__(self, in_size, out_size): + super().__init__() + self.linear = paddle.nn.Linear(in_size, out_size) + + @paddle.jit.to_static + def forward(self, x): + y = self.linear(x) + y.stop_gradient = True + out = cus_tanh_2.apply(y, func1=paddle.tanh) + return out + + class TestPyLayerBase(unittest.TestCase): def setUp(self): self.place = "gpu" if paddle.is_compiled_with_cuda() else "cpu" @@ -269,10 +410,69 @@ def test_func(x1, x2): self._run_and_compare(input1, input2) + def test_apply_kwargs_pylayer(self): + @paddle.jit.to_static + def test_func(x1, x2): + y = scaled_layer_2.apply(x1=x2, x2=x1) + return y + + self.dygraph_func = test_func + + input1 = paddle.randn([2, 3]).astype("float32") + input2 = paddle.randn([2, 3]).astype("float32") + input1.stop_gradient = False + input2.stop_gradient = False + + self._run_and_compare(input1, input2) + + def test_non_variable_inputs(self): + @paddle.jit.to_static + def test_func(x): + y = cus_tanh_2.apply(x, func1=paddle.tanh) + return y + + self.dygraph_func = test_func + + input1 = paddle.randn([2, 3]).astype("float32") + input1.stop_gradient = False + + self._run_and_compare(input1) + + def test_simple_pylayer_return_none_with_no_grad(self): + @paddle.jit.to_static + def test_func(input1, input2): + z = cus_tanh_3.apply(input1, input2, paddle.tanh, paddle.square) + z = z[2] + z[3] + return z + + self.dygraph_func = test_func + + input1 = paddle.randn([2, 3]).astype("float32") + input2 = paddle.randn([2, 3]).astype("float32") + input1.stop_gradient = False + input2.stop_gradient = True + + self._run_and_compare(input1, input2) + + def test_non_variable_inputs_and_userdefined_call(self): + @paddle.jit.to_static + def test_func(input1): + y = cus_tanh_4.apply( + input1, func=user_defined_square, name="cus_tanh_test" + ) + return y + + self.dygraph_func = test_func + + input1 = paddle.randn([2, 3]).astype("float32") + input1.stop_gradient = False + + self._run_and_compare(input1) + class TestPyLayerInsideNet(TestPyLayerBase): def test_single_in_single_out(self): - simple_net = SimpleNet_1() + simple_net = SimpleNet_1(in_size=4, out_size=8) self.dygraph_func = simple_net input1 = paddle.randn([3, 4]).astype("float32") @@ -287,6 +487,142 @@ def test_inplace(self): input1.stop_gradient = False self._run_and_compare(input1) + def test_non_variable_args_pylayernet(self): + simple_net = SimplePyLayerNet(in_size=4, out_size=8) + self.dygraph_func = simple_net + + input1 = paddle.randn([3, 4]).astype("float32") + input1.stop_gradient = False + self._run_and_compare(input1) + + def test_pylayer_net_with_no_grad(self): + simple_net = SimplePyLayerNetMultiIn(in_size=4, out_size=8) + self.dygraph_func = simple_net + + input1 = paddle.randn([3, 4]).astype("float32") + input2 = paddle.randn([3, 4]).astype("float32") + input1.stop_gradient = False + input2.stop_gradient = True + self._run_and_compare(input1, input2) + + +class PyLayerTrainHelper(unittest.TestCase): + def setUp(self): + self.place = "gpu" if paddle.is_compiled_with_cuda() else "cpu" + + def _run_train(self, to_static, layer_builder, build_strategy=None): + """ + Tests model decorated by `dygraph_to_static_output` in static graph mode. For users, the model is defined in dygraph mode and trained in static graph mode. 
+ """ + paddle.jit.enable_to_static(to_static) + + paddle.set_device(self.place) + np.random.seed(SEED) + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + # net = self.build_layer() + net = layer_builder() + if to_static: + net = paddle.jit.to_static(net, build_strategy=build_strategy) + + _, _, avg_loss = train(net) + return avg_loss.numpy() + + +class TestTrainingPyLayer(PyLayerTrainHelper): + def test_tanh_pylayer(self): + build_layer = lambda: SimpleNet_2(784, 20) + + static_loss = self._run_train(to_static=True, layer_builder=build_layer) + dygraph_loss = self._run_train( + to_static=False, layer_builder=build_layer + ) + + np.testing.assert_allclose( + static_loss, + dygraph_loss, + rtol=1e-05, + err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}', + ) + + def test_sigmoid_pylayer(self): + build_layer = lambda: SimpleNet_3(784, 20) + + static_loss = self._run_train(to_static=True, layer_builder=build_layer) + dygraph_loss = self._run_train( + to_static=False, layer_builder=build_layer + ) + + np.testing.assert_allclose( + static_loss, + dygraph_loss, + rtol=1e-05, + err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}', + ) + + def test_pylayer_net_no_grad(self): + build_layer = lambda: SimplePyLayerNetStopGrad(784, 20) + + static_loss = self._run_train(to_static=True, layer_builder=build_layer) + dygraph_loss = self._run_train( + to_static=False, layer_builder=build_layer + ) + + np.testing.assert_allclose( + static_loss, + dygraph_loss, + rtol=1e-05, + err_msg=f'static_loss: {static_loss} \n dygraph_loss: {dygraph_loss}', + ) + + +class TestPyLayerJitSaveLoad(unittest.TestCase): + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join( + self.temp_dir.name, "test_pylayer/jit_save_model" + ) + # enable dygraph mode + paddle.base.enable_dygraph() + # config seed + paddle.seed(SEED) + paddle.framework.random._manual_program_seed(SEED) + + def tearDown(self): + self.temp_dir.cleanup() + + def train_and_save_model(self, model_path=None): + layer = SimpleNet_1(784, 20) + example_inputs, layer, _ = train(layer) + final_model_path = model_path if model_path else self.model_path + orig_input_types = [type(x) for x in example_inputs] + paddle.jit.save( + layer=layer, path=final_model_path, input_spec=example_inputs + ) + new_input_types = [type(x) for x in example_inputs] + self.assertEqual(orig_input_types, new_input_types) + return layer + + def test_save_load(self): + # train and save model + train_layer = self.train_and_save_model() + # load model + loaded_layer = paddle.jit.load(self.model_path) + self.load_and_inference(train_layer, loaded_layer) + + def load_and_inference(self, train_layer, infer_layer): + train_layer.eval() + infer_layer.eval() + # inference & compare + x = paddle.base.dygraph.to_variable( + np.random.random((1, 784)).astype('float32') + ) + train_layer_result = train_layer(x).numpy() + infer_layer_result = infer_layer(x).numpy() + + np.testing.assert_array_equal(train_layer_result, infer_layer_result) + if __name__ == "__main__": unittest.main() diff --git a/test/legacy_test/test_jit_save_load.py b/test/legacy_test/test_jit_save_load.py index e2df76f4751946..71c5c06a716b24 100644 --- a/test/legacy_test/test_jit_save_load.py +++ b/test/legacy_test/test_jit_save_load.py @@ -301,7 +301,7 @@ def forward_general(self, x): def train(layer, input_size=784, label_size=1): # create optimizer sgd = paddle.optimizer.SGD( - learning_rate=0.01, parameter_list=layer.parameters() 
+ learning_rate=0.01, parameters=layer.parameters() ) # create data loader train_loader = base.io.DataLoader.from_generator(capacity=5) @@ -316,7 +316,7 @@ def train(layer, input_size=784, label_size=1): cost = layer(img) loss = paddle.nn.functional.cross_entropy( - cost, label, reduction='none', use_softmax=False + cost, label, reduction='none', use_softmax=True ) avg_loss = paddle.mean(loss) diff --git a/test/legacy_test/test_program_prune_backward.py b/test/legacy_test/test_program_prune_backward.py index 237684e3b0bd97..581635d5a68ada 100755 --- a/test/legacy_test/test_program_prune_backward.py +++ b/test/legacy_test/test_program_prune_backward.py @@ -81,6 +81,27 @@ def loss2(pred, label): return avg_loss +def pylayer_net(use_feed=None): + x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32') + label = paddle.static.data('label', shape=[-1, 1], dtype='int64') + + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn) + hidden = paddle.static.nn.fc(x=[y], size=4, activation="softmax") + loss = paddle.nn.functional.cross_entropy( + input=hidden, label=label, reduction='none', use_softmax=False + ) + loss = paddle.mean(loss, name='mean_softmax_loss') + return loss + + def optimization_in_cond_net(with_optimize=False): x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32') label = paddle.static.data('label', shape=[-1, 1], dtype='int64') @@ -115,6 +136,31 @@ def loss2(opt, pred, label, with_optimize): return avg_loss +def optimization_in_pylayer_net(with_optimize=False): + x = paddle.static.data(name="x", shape=[-1, 4], dtype='float32') + label = paddle.static.data('label', shape=[-1, 1], dtype='int64') + + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn) + hidden = 3 * y + loss = paddle.nn.functional.softmax_with_cross_entropy( + logits=hidden, label=label + ) + loss = paddle.mean(loss, name='mean_softmax_loss') + sgd = paddle.optimizer.SGD(learning_rate=0.1) + if with_optimize: + sgd.minimize(loss) + + return loss + + class TestProgramPruneBackward(unittest.TestCase): def program_compare(self, program_a, program_b): assert isinstance( @@ -249,6 +295,19 @@ def optimizer(): method=cond_net, feed_dict=feed_dict, optimizer=optimizer ) + def test_pylayer(self): + def optimizer(): + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + return optimizer + + with self.program_scope_guard(): + x_in = np.random.random(size=(10, 4)).astype('float32') + label_in = np.random.randint(1, size=(10, 1)).astype('int64') + feed_dict = {'x': x_in, 'label': label_in} + self.check_prune_correctness( + method=pylayer_net, feed_dict=feed_dict, optimizer=optimizer + ) + def test_optimization_in_cond(self): x_in = np.random.random(size=(10, 4)).astype('float32') label_in = np.random.randint(1, size=(10, 1)).astype('int64') @@ -279,6 +338,36 @@ def test_optimization_in_cond(self): self.program_compare(test_prog_orig, test_prog_prune) self.assertEqual(loss_data_orig, loss_data_prune) + def test_optimization_in_pylayer(self): + x_in = np.random.random(size=(10, 4)).astype('float32') + label_in = np.random.randint(1, size=(10, 1)).astype('int64') + feed_dict = {'x': x_in, 'label': label_in} + with self.program_scope_guard(): + loss = optimization_in_pylayer_net(False) + main_program = base.default_main_program() + test_prog_orig = 
main_program.clone(for_test=True) + place = core.CPUPlace() + exe = base.Executor(place) + exe.run(base.default_startup_program()) + (loss_data_orig,) = exe.run( + test_prog_orig, feed=feed_dict, fetch_list=[loss.name] + ) + + with self.program_scope_guard(): + loss = optimization_in_pylayer_net(True) + main_program = base.default_main_program() + test_prog_prune = main_program.clone(for_test=True) + + place = core.CPUPlace() + exe = base.Executor(place) + exe.run(base.default_startup_program()) + (loss_data_prune,) = exe.run( + test_prog_prune, feed=feed_dict, fetch_list=[loss.name] + ) + + self.program_compare(test_prog_orig, test_prog_prune) + self.assertEqual(loss_data_orig, loss_data_prune) + @contextlib.contextmanager def program_scope_guard(self): prog = base.Program() diff --git a/test/legacy_test/test_prune.py b/test/legacy_test/test_prune.py index 00b96074ab5c2e..91314d3c86b800 100644 --- a/test/legacy_test/test_prune.py +++ b/test/legacy_test/test_prune.py @@ -22,121 +22,82 @@ from paddle.base import framework -class TestPrune(unittest.TestCase): - def net(self): - x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') - x.desc.set_need_check_feed(False) - label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") - label.desc.set_need_check_feed(False) - y = paddle.static.nn.fc(x=[x], size=2, activation="softmax") - loss = paddle.nn.functional.cross_entropy( - input=y, label=label, reduction='none', use_softmax=False - ) - loss = paddle.mean(x=loss) - return x, y, label, loss - - def test_prune_with_input(self): +class TestPruneBase(unittest.TestCase): + def run_net(self, net): program = framework.Program() startup_program = framework.Program() - block = program.global_block() with base.program_guard(program, startup_program): - (x, y, label, loss) = self.net() - self.assertEqual(len(block.ops), 5) + ret = net() + + return ret, program + + def check_prune_with_input( + self, + program, + feeded_var_names, + targets, + ops_before_pruned, + ops_after_pruned, + ): + block = program.global_block() + self.assertEqual(len(block.ops), len(ops_before_pruned)) self.assertEqual( [op.type for op in block.ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_before_pruned, ) pruned_program = program._prune_with_input( - feeded_var_names=[y.name, label.name], targets=[loss] + feeded_var_names=feeded_var_names, targets=targets + ) + self.assertEqual( + len(pruned_program.global_block().ops), len(ops_after_pruned) ) - self.assertEqual(len(pruned_program.global_block().ops), 2) self.assertEqual( [op.type for op in pruned_program.global_block().ops], - ["softmax_with_cross_entropy", "reduce_mean"], + ops_after_pruned, ) - def test_prune(self): - program = framework.Program() - startup_program = framework.Program() + def check_prune( + self, program, targets, ops_before_pruned, ops_after_pruned + ): block = program.global_block() - with base.program_guard(program, startup_program): - (x, y, label, loss) = self.net() - self.assertEqual(len(block.ops), 5) + self.assertEqual(len(block.ops), len(ops_before_pruned)) self.assertEqual( [op.type for op in block.ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_before_pruned, + ) + pruned_program = program._prune(targets=targets) + self.assertEqual( + len(pruned_program.global_block().ops), len(ops_after_pruned) ) - pruned_program = program._prune(targets=[loss]) - 
self.assertEqual(len(pruned_program.global_block().ops), 5) self.assertEqual( [op.type for op in pruned_program.global_block().ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_after_pruned, ) - def test_prune_target_not_list(self): - program = framework.Program() - startup_program = framework.Program() + def check_prune_target_not_list( + self, program, targets, ops_before_pruned, ops_after_pruned + ): block = program.global_block() - with base.program_guard(program, startup_program): - (x, y, label, loss) = self.net() - self.assertEqual(len(block.ops), 5) + self.assertEqual(len(block.ops), len(ops_before_pruned)) self.assertEqual( [op.type for op in block.ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_before_pruned, + ) + pruned_program = program._prune(targets=targets) + self.assertEqual( + len(pruned_program.global_block().ops), len(ops_after_pruned) ) - pruned_program = program._prune(targets=loss) - self.assertEqual(len(pruned_program.global_block().ops), 5) self.assertEqual( [op.type for op in pruned_program.global_block().ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_after_pruned, ) - def test_prune_target_none(self): - program = framework.Program() - startup_program = framework.Program() + def check_prune_target_none(self, program, ops_before_pruned): block = program.global_block() - with base.program_guard(program, startup_program): - (x, y, label, loss) = self.net() - self.assertEqual(len(block.ops), 5) + self.assertEqual(len(block.ops), len(ops_before_pruned)) self.assertEqual( [op.type for op in block.ops], - [ - "mul", - "elementwise_add", - "softmax", - "softmax_with_cross_entropy", - "reduce_mean", - ], + ops_before_pruned, ) try: pruned_program = program._prune(targets=None) @@ -147,6 +108,96 @@ def test_prune_target_none(self): ) +class TestPrune(TestPruneBase): + def net(self): + x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') + x.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + label.desc.set_need_check_feed(False) + y = paddle.static.nn.fc(x=[x], size=2, activation="softmax") + loss = paddle.nn.functional.cross_entropy( + input=y, label=label, reduction='none', use_softmax=False + ) + loss = paddle.mean(x=loss) + return x, y, label, loss + + def test_prune_with_input(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + ] + + ops_after_pruned = ["softmax_with_cross_entropy", "reduce_mean"] + (x, y, label, loss), program = self.run_net(self.net) + + self.check_prune_with_input( + program, + [y.name, label.name], + [loss], + ops_before_pruned, + ops_after_pruned, + ) + + def test_prune(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + ] + + ops_after_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + ] + + (x, y, label, loss), program = self.run_net(self.net) + + self.check_prune(program, [loss], ops_before_pruned, ops_after_pruned) + + def test_prune_target_not_list(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + ] + + ops_after_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + 
] + + (x, y, label, loss), program = self.run_net(self.net) + + self.check_prune_target_not_list( + program, loss, ops_before_pruned, ops_after_pruned + ) + + def test_prune_target_none(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "softmax_with_cross_entropy", + "reduce_mean", + ] + + (x, y, label, loss), program = self.run_net(self.net) + self.check_prune_target_none(program, ops_before_pruned) + + def mock(self, program, feed, fetch, optimize_ops): self.prune_called_times += 1 return program @@ -160,77 +211,83 @@ def _mock_guard(mock): base.Executor._prune_program = original +def net1(): + x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') + x.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + label.desc.set_need_check_feed(False) + w_param_attrs = base.ParamAttr( + name="fc_weight", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + y = paddle.static.nn.fc( + x=[x], size=2, activation="softmax", weight_attr=w_param_attrs + ) + loss1 = paddle.nn.functional.cross_entropy( + input=y, label=label, reduction='none', use_softmax=False + ) + loss1 = paddle.mean(x=loss1) + loss2 = paddle.nn.functional.cross_entropy( + input=y, label=label, reduction='none', use_softmax=False + ) + loss2 = paddle.mean(x=loss2) + loss1.persistable = True + loss2.persistable = True + return x, y, label, loss1, loss2, w_param_attrs + + +def net2(): + x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32') + x1.desc.set_need_check_feed(False) + x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32') + x2.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + label.desc.set_need_check_feed(False) + w1_param_attrs = base.ParamAttr( + name="fc_weight1", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + w2_param_attrs = base.ParamAttr( + name="fc_weight2", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + y1 = paddle.static.nn.fc( + x=[x1], size=2, activation="softmax", weight_attr=w1_param_attrs + ) + y2 = paddle.static.nn.fc( + x=[x2], size=2, activation="softmax", weight_attr=w2_param_attrs + ) + loss1 = paddle.nn.functional.cross_entropy( + input=y1, label=label, reduction='none', use_softmax=False + ) + loss1 = paddle.mean(x=loss1) + loss2 = paddle.nn.functional.cross_entropy( + input=y2, label=label, reduction='none', use_softmax=False + ) + loss2 = paddle.mean(x=loss2) + return ( + x1, + x2, + y1, + y2, + label, + loss1, + loss2, + w1_param_attrs, + w2_param_attrs, + ) + + class TestExecutorRunAutoPrune(unittest.TestCase): - def net1(self): - x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') - x.desc.set_need_check_feed(False) - label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") - label.desc.set_need_check_feed(False) - w_param_attrs = base.ParamAttr( - name="fc_weight", - learning_rate=0.5, - initializer=paddle.nn.initializer.Constant(1.0), - trainable=True, - ) - y = paddle.static.nn.fc( - x=[x], size=2, activation="softmax", weight_attr=w_param_attrs - ) - loss1 = paddle.nn.functional.cross_entropy( - input=y, label=label, reduction='none', use_softmax=False - ) - loss1 = paddle.mean(x=loss1) - loss2 = paddle.nn.functional.cross_entropy( - input=y, label=label, reduction='none', use_softmax=False - ) - loss2 = paddle.mean(x=loss2) - loss1.persistable = True - 
loss2.persistable = True - return x, y, label, loss1, loss2, w_param_attrs - - def net2(self): - x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32') - x1.desc.set_need_check_feed(False) - x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32') - x2.desc.set_need_check_feed(False) - label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") - label.desc.set_need_check_feed(False) - w1_param_attrs = base.ParamAttr( - name="fc_weight1", - learning_rate=0.5, - initializer=paddle.nn.initializer.Constant(1.0), - trainable=True, - ) - w2_param_attrs = base.ParamAttr( - name="fc_weight2", - learning_rate=0.5, - initializer=paddle.nn.initializer.Constant(1.0), - trainable=True, - ) - y1 = paddle.static.nn.fc( - x=[x1], size=2, activation="softmax", weight_attr=w1_param_attrs - ) - y2 = paddle.static.nn.fc( - x=[x2], size=2, activation="softmax", weight_attr=w2_param_attrs - ) - loss1 = paddle.nn.functional.cross_entropy( - input=y1, label=label, reduction='none', use_softmax=False - ) - loss1 = paddle.mean(x=loss1) - loss2 = paddle.nn.functional.cross_entropy( - input=y2, label=label, reduction='none', use_softmax=False - ) - loss2 = paddle.mean(x=loss2) - return ( - x1, - x2, - y1, - y2, - label, - loss1, - loss2, - w1_param_attrs, - w2_param_attrs, - ) + def setUp(self): + self.net1 = net1 + self.net2 = net2 def test_not_prune(self): """ diff --git a/test/legacy_test/test_static_pylayer.py b/test/legacy_test/test_static_pylayer.py index 3a1634e92bf58b..8b193d6e087be7 100644 --- a/test/legacy_test/test_static_pylayer.py +++ b/test/legacy_test/test_static_pylayer.py @@ -16,6 +16,7 @@ import unittest import numpy as np +from legacy_test.test_prune import TestExecutorRunAutoPrune, TestPruneBase import paddle from paddle import base @@ -27,6 +28,9 @@ class TestStaticPyLayerInputOutput(unittest.TestCase): + def setUp(self): + paddle.enable_static() + def test_return_single_var(self): """ pseudocode: @@ -34,8 +38,6 @@ def test_return_single_var(self): y = 3 * x """ - paddle.enable_static() - def forward_fn(x): return 3 * x @@ -65,8 +67,6 @@ def test_return_0d_tensor(self): y = 3 * x """ - paddle.enable_static() - def forward_fn(x): return 3 * x @@ -96,8 +96,6 @@ def test_0d_tensor_backward(self): dx = -5 * dy ''' - paddle.enable_static() - def forward_fn(x): return 3 * x @@ -132,8 +130,6 @@ def backward_fn(dy): self.assertEqual(x_grad.shape, ()) def test_return_var_typle(self): - paddle.enable_static() - def forward_fn(a, b): return 3 * a, -2 * b @@ -168,8 +164,6 @@ def forward_fn(a, b): ) def test_return_forward_none(self): - paddle.enable_static() - input_shape = (1, 3) def forward_fn(x): @@ -198,8 +192,6 @@ def test_wrong_structure_exception(self): wrong number of inputs and outputs returned by ``forward_fn`` and ``backward_fn`` """ - paddle.enable_static() - def forward_fn(a, b): return 3 * a, -b, paddle.mean(b) @@ -232,6 +224,9 @@ def backward_fn(daout, dbout): class TestControlFlowNestedStaticPyLayer(unittest.TestCase): + def setUp(self): + paddle.enable_static() + def test_cond_inside_static_pylayer(self): """ forward propagation: @@ -256,8 +251,6 @@ def backward_fn(diout, daout): return daout_scaled, daout * daout """ - paddle.enable_static() - def forward_fn(i, a): return i, paddle.static.nn.cond( i < 5.0, lambda: paddle.add(a, a), lambda: paddle.subtract(a, a) @@ -343,9 +336,10 @@ def backward_fn(diout, daout): class TestStaticPyLayerBackward(unittest.TestCase): - def test_identity_backward(self): + def setUp(self): paddle.enable_static() + def 
test_identity_backward(self): def forward_fn(x): return x @@ -405,8 +399,6 @@ def test_static_pylayer_backward(self): dx = tanh(dy) ''' - paddle.enable_static() - def forward_fn(x): return 3 * x @@ -455,5 +447,231 @@ def backward_fn(dy): ) +class TestStaticPyLayerPrune(TestPruneBase): + def setUp(self): + paddle.enable_static() + + def net(self): + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') + x.desc.set_need_check_feed(False) + hidden = paddle.static.nn.fc(x=[x], size=4, activation="softmax") + y = paddle.static.nn.static_pylayer(forward_fn, [hidden], backward_fn) + loss = paddle.mean(y) + return x, hidden, y, loss + + def net_with_weight(self): + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') + x.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + label.desc.set_need_check_feed(False) + w_param_attrs = base.ParamAttr( + name="fc_weight", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + + y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn) + hidden = paddle.static.nn.fc( + x=[y], size=4, activation="softmax", weight_attr=w_param_attrs + ) + loss1 = paddle.nn.functional.cross_entropy( + input=hidden, label=label, reduction='none', use_softmax=False + ) + loss1 = paddle.mean(x=loss1) + loss2 = paddle.nn.functional.cross_entropy( + input=hidden, label=label, reduction='none', use_softmax=False + ) + loss2 = paddle.mean(x=loss2) + loss1.persistable = True + loss2.persistable = True + + return x, hidden, label, loss1, loss2, w_param_attrs + + def test_prune_with_input(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + ops_after_pruned = ["pylayer", "reduce_mean"] + + (x, hidden, y, loss), program = self.run_net(self.net) + + self.check_prune_with_input( + program, [hidden.name], [loss], ops_before_pruned, ops_after_pruned + ) + + def test_prune(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + ops_after_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + (x, hidden, y, loss), program = self.run_net(self.net) + + self.check_prune(program, [loss], ops_before_pruned, ops_after_pruned) + + def test_prune_target_not_list(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + ops_after_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + (x, hidden, y, loss), program = self.run_net(self.net) + self.check_prune_target_not_list( + program, loss, ops_before_pruned, ops_after_pruned + ) + + def test_prune_target_none(self): + ops_before_pruned = [ + "mul", + "elementwise_add", + "softmax", + "pylayer", + "reduce_mean", + ] + + (x, hidden, y, loss), program = self.run_net(self.net) + self.check_prune_target_none(program, ops_before_pruned) + + +def net_with_weight1(): + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + x = paddle.static.data(name='x', shape=[-1, 2], dtype='float32') + x.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + 
label.desc.set_need_check_feed(False) + w_param_attrs = base.ParamAttr( + name="fc_weight", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + + y = paddle.static.nn.static_pylayer(forward_fn, [x], backward_fn) + hidden = paddle.static.nn.fc( + x=[y], size=4, activation="softmax", weight_attr=w_param_attrs + ) + loss1 = paddle.nn.functional.cross_entropy( + input=hidden, label=label, reduction='none', use_softmax=False + ) + loss1 = paddle.mean(x=loss1) + loss2 = paddle.nn.functional.cross_entropy( + input=hidden, label=label, reduction='none', use_softmax=False + ) + loss2 = paddle.mean(x=loss2) + loss1.persistable = True + loss2.persistable = True + + return x, hidden, label, loss1, loss2, w_param_attrs + + +def net_with_weight2(): + def forward_fn(x): + y = 3 * x + return y + + def backward_fn(dy): + grad = paddle.exp(dy) + return grad + + x1 = paddle.static.data(name='x1', shape=[-1, 2], dtype='float32') + x1.desc.set_need_check_feed(False) + x2 = paddle.static.data(name='x2', shape=[-1, 2], dtype='float32') + x2.desc.set_need_check_feed(False) + label = paddle.static.data(name="label", shape=[-1, 1], dtype="int64") + label.desc.set_need_check_feed(False) + w1_param_attrs = base.ParamAttr( + name="fc_weight1", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + w2_param_attrs = base.ParamAttr( + name="fc_weight2", + learning_rate=0.5, + initializer=paddle.nn.initializer.Constant(1.0), + trainable=True, + ) + + y1 = paddle.static.nn.static_pylayer(forward_fn, [x1], backward_fn) + hidden1 = paddle.static.nn.fc( + x=[y1], size=4, activation="softmax", weight_attr=w1_param_attrs + ) + y2 = paddle.static.nn.static_pylayer(forward_fn, [x2], backward_fn) + hidden2 = paddle.static.nn.fc( + x=[y2], size=4, activation="softmax", weight_attr=w2_param_attrs + ) + + loss1 = paddle.nn.functional.cross_entropy( + input=hidden1, label=label, reduction='none', use_softmax=False + ) + loss1 = paddle.mean(x=loss1) + loss2 = paddle.nn.functional.cross_entropy( + input=hidden2, label=label, reduction='none', use_softmax=False + ) + loss2 = paddle.mean(x=loss2) + loss1.persistable = True + loss2.persistable = True + + return x1, x2, y1, y2, label, loss1, loss2, w1_param_attrs, w2_param_attrs + + +class TestStaticPyLayerExecutorAutoPrune(TestExecutorRunAutoPrune): + def setUp(self): + paddle.enable_static() + self.net1 = net_with_weight1 + self.net2 = net_with_weight2 + + if __name__ == '__main__': unittest.main()
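
A minimal sketch of the workflow this patch targets, assuming the public paddle.jit / PyLayer APIs already used in the tests above; the CusTanh and Net classes, the save path, and the tensor shapes below are illustrative only, not part of the patch:

import paddle
from paddle.autograd import PyLayer
from paddle.static import InputSpec


class CusTanh(PyLayer):
    @staticmethod
    def forward(ctx, x):
        # Save the activation so backward can reuse it.
        y = paddle.tanh(x)
        ctx.save_for_backward(y)
        return y

    @staticmethod
    def backward(ctx, dy):
        (y,) = ctx.saved_tensor()
        return dy * (1 - paddle.square(y))


class Net(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(4, 8)

    def forward(self, x):
        return CusTanh.apply(self.linear(x))


# Convert to static graph; the PyLayer call is lowered to a pylayer op.
net = paddle.jit.to_static(Net(), input_spec=[InputSpec([None, 4], 'float32')])
paddle.jit.save(net, "./pylayer_demo/net")  # illustrative path

# Reload for inference.
loaded = paddle.jit.load("./pylayer_demo/net")
loaded.eval()
out = loaded(paddle.randn([2, 4], dtype='float32'))
print(out.shape)  # [2, 8]

When the saved program is normalized for inference, the pylayer op keeps only its forward block (the backward block is pruned, as done in normalize_program above), so paddle.jit.load restores a model whose forward pass matches the dygraph PyLayer.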