diff --git a/paddle/fluid/framework/op_compatible_info.cc b/paddle/fluid/framework/op_compatible_info.cc
index 37c8dd22c174a..d710e76c0c111 100644
--- a/paddle/fluid/framework/op_compatible_info.cc
+++ b/paddle/fluid/framework/op_compatible_info.cc
@@ -79,8 +79,6 @@ void OpCompatibleMap::InitOpCompatibleMap() {
   op_compatible_map_["dpsgd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["eye"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["fill_any_like"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
-  op_compatible_map_["filter_by_instag"] = {"1.6.0",
-                                            OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["hard_swish"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["gather_nd"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["instance_norm"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
@@ -91,7 +89,6 @@ void OpCompatibleMap::InitOpCompatibleMap() {
   op_compatible_map_["multiclass_nms2"] = {"1.6.0",
                                            OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["one_hot_v2"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
-  op_compatible_map_["prroi_pool"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["pull_box_sparse"] = {"1.6.0",
                                            OpCompatibleType::DEFIN_NOT};
   op_compatible_map_["scatter_nd_add"] = {"1.6.0", OpCompatibleType::DEFIN_NOT};
diff --git a/paddle/fluid/operators/filter_by_instag_op.cc b/paddle/fluid/operators/filter_by_instag_op.cc
deleted file mode 100644
index 02735a1ee5be0..0000000000000
--- a/paddle/fluid/operators/filter_by_instag_op.cc
+++ /dev/null
@@ -1,181 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
- -#include "paddle/fluid/operators/filter_by_instag_op.h" - -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/var_type_inference.h" - -namespace paddle { -namespace operators { -class FilterByInstagOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("Ins"), - true, - platform::errors::InvalidArgument("Input(Ins) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Ins_tag"), - true, - platform::errors::InvalidArgument( - "Input(Ins_tag) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Filter_tag"), - true, - platform::errors::InvalidArgument( - "Input(Filter_tag) should be not null.")); - - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), - true, - platform::errors::InvalidArgument("Output(Out) should be not null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("LossWeight"), - true, - platform::errors::InvalidArgument( - "Output(LossWeight) shoudl not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("IndexMap"), - true, - platform::errors::InvalidArgument( - "Output(IndexMap) should be not null.")); - - auto x1_dims = ctx->GetInputDim("Ins"); // batch_size * vec - - ctx->SetOutputDim("Out", phi::make_ddim({-1, x1_dims[1]})); - ctx->SetOutputDim("LossWeight", phi::make_ddim({-1, 1})); - ctx->SetOutputDim("IndexMap", phi::make_ddim({-1, 2})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "Ins"); - return phi::KernelKey(data_type, ctx.device_context().GetPlace()); - } -}; - -class FilterByInstagOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("Ins", "(phi::DenseTensor) embeded tensor"); - AddInput("Ins_tag", "(phi::DenseTensor) ins tag list"); - AddInput("Filter_tag", "(1D Tensor) filter tag list"); - AddAttr("is_lod", "is Ins with LoD info or not, default True"); - AddAttr("out_val_if_empty", - "if the output after filter is empty, the output value") - .SetDefault(0); - AddOutput("Out", "(phi::DenseTensor) embeded tensor filtered by instag"); - AddOutput("LossWeight", "(Tensor) loss weight."); - AddOutput("IndexMap", - "(phi::DenseTensor) mapping from Out rows to X1 rows"); - AddComment(R"DOC( -Filter By Instag Op - -This operator is used to filter embeded ins. - -There are 3 inputs. First is embeded ins, Second is tags for ins, -Third is tags to filter. - -There are 3 outputs. First is filtered embeded ins, Second is Loss Weight, -Third is the IndexMap from Out line number to X1 line number. 
-)DOC"); - } -}; - -class FilterByInstagOpGrad : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("IndexMap"), - true, - platform::errors::InvalidArgument( - "Input(IndexMap) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), - true, - platform::errors::InvalidArgument( - "Grad Input(Out) should be not null")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Ins"), - true, - platform::errors::InvalidArgument("Input(Ins) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasInput("LossWeight"), - true, - platform::errors::InvalidArgument( - "Input(LossWeight) should be not null")); - PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Ins")), - true, - platform::errors::InvalidArgument( - "Grad Output(Ins) should be not null")); - - auto grad_out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x1_dims = ctx->GetInputDim("Ins"); - ctx->SetOutputDim(framework::GradVarName("Ins"), - phi::make_ddim({x1_dims[0], grad_out_dims[1]})); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - auto data_type = OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")); - return phi::KernelKey(data_type, ctx.device_context().GetPlace()); - } -}; - -template -class FilterByInstagGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("filter_by_instag_grad"); - op->SetInput("IndexMap", this->Output("IndexMap")); - op->SetInput("Ins", this->Input("Ins")); - op->SetAttrMap(this->Attrs()); - op->SetInput("LossWeight", this->Output("LossWeight")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Ins"), this->InputGrad("Ins")); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(filter_by_instag, - ops::FilterByInstagOp, - ops::FilterByInstagOpMaker, - ops::FilterByInstagGradOpMaker, - ops::FilterByInstagGradOpMaker); - -REGISTER_OPERATOR(filter_by_instag_grad, ops::FilterByInstagOpGrad); - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - CPU, - ALL_LAYOUT, - ops::FilterByInstagKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - CPU, - ALL_LAYOUT, - ops::FilterByInstagGradKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.cu b/paddle/fluid/operators/filter_by_instag_op.cu deleted file mode 100644 index 4449044acb89b..0000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.cu +++ /dev/null @@ -1,639 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000 - -#include "paddle/fluid/operators/filter_by_instag_op.h" - -#if defined(PADDLE_WITH_CUDA) -#include -#endif - -#include -#include - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/mixed_vector.h" - -#if defined(PADDLE_WITH_CUDA) -namespace cg = cooperative_groups; -#endif - -namespace paddle { -namespace operators { - -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -#define WARP_SIZE 32 -#define MAX_WARP_NUM 32 - -#if defined(PADDLE_WITH_CUDA) - -template -__global__ void filter_copy_fuse_kernel(const size_t N, - const int ins_per_thread, - size_t* x1_lods_data, - size_t* x2_lods_data, - const int64_t* x2_data, - const int64_t* x3_data, - int64_t filter_tag_size, - T* out_data, - int64_t* map_data, - size_t* map_lods_data, - size_t* out_lods_data, - size_t* out_idx_data, - const T* x1_data, - int x1_embed_size, - float* loss_weight_data, - float fill_value) { - // N is instance num - // one threads for ins_per_thread instances - int idx = blockIdx.x * blockDim.x + threadIdx.x; - - cg::thread_block b = cg::this_thread_block(); - cg::thread_block_tile g = cg::tiled_partition(b); - - int gid = idx / WARP_SIZE; - - // general use - int thread_num = - (N + (ins_per_thread - 1)) / ins_per_thread; // real thread num - int total_warp_num = thread_num / WARP_SIZE; // 30 - int remain_thread_num = thread_num % WARP_SIZE; // 16 - - int warp_thread_num = -1; - if (gid < total_warp_num) { - warp_thread_num = WARP_SIZE; - } else { - warp_thread_num = remain_thread_num; - } - - int group_num = total_warp_num; - if (remain_thread_num > 0) { - group_num = total_warp_num + 1; - } - - if (gid >= group_num) return; - - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - - if (N < ins_end) ins_end = N; - - int flag_data[5]; - int prefix_sum_data[5]; - int prefix_sum_data2[5]; - - __shared__ int shr[MAX_WARP_NUM]; - __shared__ int shr2[MAX_WARP_NUM]; - __shared__ int shr3[MAX_WARP_NUM]; - - for (int p = ins_start; p < ins_end; p++) { - int ins_tag_start = x2_lods_data[p]; - int ins_tag_end = x2_lods_data[p + 1]; - flag_data[p - ins_start] = 0; - // filter logic - int i = ins_tag_start; - for (; i < ins_tag_end; i++) { - int64_t ins_tag = x2_data[i]; - int j = 0; - for (; j < filter_tag_size; j++) { - if (x3_data[j] == ins_tag) break; - } - // if ins_tag in filter tag - if (j < filter_tag_size) { - flag_data[p - ins_start] = 1; - break; - } - } - } - - int sum_addr = 0; - int sum_flag = 0; - int sum_out_lods = 0; - - int local_addr = 0; - int local_flag = 0; - int local_out_lods = 0; - - if (ins_start < ins_end) { - for (int p = ins_start; p < ins_end; p++) { - int previous = -1; - if (p == ins_start) { - previous = 0; - } else { - previous = prefix_sum_data[p - ins_start - 1]; - } - - prefix_sum_data[p - ins_start] = - previous + - flag_data[p - ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - local_addr = prefix_sum_data[ins_end - 1 - ins_start]; - sum_addr = local_addr; - - for (int p = ins_start; p < ins_end; p++) { - local_flag += flag_data[p - ins_start]; - } - sum_flag = local_flag; - - for (int p = ins_start; p < ins_end; p++) { - local_out_lods += - flag_data[p - 
ins_start] * (x1_lods_data[p + 1] - x1_lods_data[p]); - } - - sum_out_lods = local_out_lods; - } - - for (int i = 1; i < warp_thread_num; i *= 2) { - int temp_addr = g.shfl_up(sum_addr, i); - int temp_flag = g.shfl_up(sum_flag, i); - int temp_out_lods = g.shfl_up(sum_out_lods, i); - - if (g.thread_rank() >= i) { - sum_addr += temp_addr; - sum_flag += temp_flag; - sum_out_lods += temp_out_lods; - } - } - - if (g.thread_rank() == warp_thread_num - 1) { - shr[gid] = sum_addr; - shr2[gid] = sum_flag; - shr3[gid] = sum_out_lods; - } - - b.sync(); - - int sum_addr2 = 0; - int sum_flag2 = 0; - int sum_out_lods2 = 0; - - // communicate between warp - if (g.thread_rank() < group_num) { - sum_addr2 = shr[g.thread_rank()]; - sum_flag2 = shr2[g.thread_rank()]; - sum_out_lods2 = shr3[g.thread_rank()]; - } - - for (int i = 1; i < group_num; i *= 2) { - int temp_addr2 = g.shfl_up(sum_addr2, i); - int temp_flag2 = g.shfl_up(sum_flag2, i); - int temp_out_lods2 = g.shfl_up(sum_out_lods2, i); - - if (g.thread_rank() >= i) { - sum_addr2 += temp_addr2; - sum_flag2 += temp_flag2; - sum_out_lods2 += temp_out_lods2; - } - } - - int sum_addr3 = g.shfl(sum_addr2, gid); - int sum_flag3 = g.shfl(sum_flag2, gid); - int sum_out_lods3 = g.shfl(sum_out_lods2, gid); - - int p_flag; - int p_addr; - int p_out_lods; - - if (ins_start < ins_end) { - p_addr = sum_addr3 - shr[gid] + sum_addr - local_addr; - p_flag = sum_flag3 - shr2[gid] + sum_flag - local_flag; - p_out_lods = sum_out_lods3 - shr3[gid] + sum_out_lods - local_out_lods; - - for (int p = ins_start; p < ins_end; p++) { - if (ins_start == p) { - prefix_sum_data2[p - ins_start] = p_addr; - } else { - prefix_sum_data2[p - ins_start] = - prefix_sum_data2[p - ins_start - 1] + - flag_data[p - ins_start - 1] * - (x1_lods_data[p] - x1_lods_data[p - 1]); - } - } - - if (gid == 0 && g.thread_rank() == group_num - 1) { - *out_idx_data = (sum_flag2 + 1); - map_lods_data[sum_flag2] = sum_flag2; - } - } - - int sum_out_lods4 = g.shfl(sum_out_lods2 + 1, group_num - 1); - - if (ins_start < ins_end) { - int out_lods_idx = p_flag + 1; - for (int p = ins_start; p < ins_end; p++) { - if (flag_data[p - ins_start] == 1) { - size_t batch_len = x1_lods_data[p + 1] - x1_lods_data[p]; - int t = out_lods_idx - 1; - int previous; - if (out_lods_idx == p_flag + 1) { - previous = p_out_lods; - } else { - previous = out_lods_data[t]; - } - map_data[t * 3] = (int64_t)previous; - map_data[t * 3 + 1] = x1_lods_data[p]; - map_lods_data[t] = t; - out_lods_data[out_lods_idx] = previous + batch_len; - map_data[t * 3 + 2] = batch_len; - out_lods_idx++; - } - } - - // fill loss_weight_data - if (sum_out_lods4 > 1) { - int out_data_num = sum_out_lods4 - 1; - int out_start = ins_start; - if (out_start < out_data_num) { - int out_end = ins_end >= out_data_num ? 
out_data_num : ins_end; - for (int p = out_start; p < out_end; p++) { - loss_weight_data[p] = fill_value; - } - } - } - - for (int p = ins_start; p < ins_end; p++) { - // copy logic - if (flag_data[p - ins_start] == 1) { - auto output_start_idx = prefix_sum_data2[p - ins_start]; - T* dst = out_data + output_start_idx * x1_embed_size; - const T* src_start = x1_data + x1_lods_data[p] * x1_embed_size; - const T* src_end = x1_data + x1_lods_data[p + 1] * x1_embed_size; - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } - } - } - - b.sync(); -} - -template -__global__ void copy_grad_kernel(const size_t N, - const int ins_per_thread, - const T* out_grad_data, - T* x1_grad_data, - const int64_t* map_data, - int x1_embed_size) { - // N is instance num - // one threads for one instance - int idx = blockIdx.x * blockDim.x + threadIdx.x; - int ins_start = idx * ins_per_thread; - int ins_end = (idx + 1) * ins_per_thread; - if (ins_start >= N) { - return; - } - if (ins_end > N) ins_end = N; - for (int p = ins_start; p < ins_end; p++) { - T* dst = x1_grad_data + map_data[p * 3 + 1] * x1_embed_size; - const T* src_start = out_grad_data + map_data[p * 3] * x1_embed_size; - const T* src_end = - out_grad_data + (map_data[p * 3] + map_data[p * 3 + 2]) * x1_embed_size; - - for (const T* j = src_start; j != src_end; dst++, j++) { - *dst = *j; - } - } -} - -#endif - -template -class FilterByInstagGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - - gpuStream_t current_stream = context.cuda_device_context().stream(); - - int max_thread_num_per_block = 1024; - // context.cuda_device_context().GetMaxThreadsPerBlock(); - // X1 is global FC output - // Dim [batch size, embedding size] - const phi::DenseTensor* x1 = context.Input("Ins"); - bool is_lod = context.Attr("is_lod"); - - int is_x1_lod = -1; - if (is_lod) - is_x1_lod = 1; - else - is_x1_lod = 0; - - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - size_t x1_embed_size = x1->dims()[1]; - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... 
]] - const phi::DenseTensor* x2 = context.Input("Ins_tag"); - // expected auto = const int64_t - const int64_t* x2_data = x2->data(); - - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - const phi::DenseTensor* x3 = context.Input("Filter_tag"); - const int64_t* x3_data = x3->data(); - - Vector x2_lods; - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_per_num = x2->dims()[1]; - // x2_lods.resize(x2->dims()[0] + 1); - // move to cuda - x2_lods.push_back(0); - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_per_num); - } - } - - const size_t x2_lods_size = x2_lods.size() - 1; - phi::MixVector mixv_x2_lods(&x2_lods); - - size_t* x2_lods_data = mixv_x2_lods.CUDAMutableData(gpu_place); - - Vector x1_lods; - if (!is_x1_lod) { - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // x1_lods = context.Input("Ins")->lod()[0]; - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { // lod_level = 1 - x1_lods = x1->lod()[0]; - } else { // lod_level = 0 - // x1_lods.resize(x1->dims()[0] + 1); - // move to cuda - x1_lods.push_back(0); - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - - phi::MixVector mixv_x1_lods(&x1_lods); - - size_t* x1_lods_data = mixv_x1_lods.CUDAMutableData(gpu_place); - auto* x1_data = x1->data(); - - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - - int out_first = x1_lods.back(); - - out->Resize(phi::make_ddim({(int64_t)out_first, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)x2_lods_size, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)x2_lods_size, 1})); - - T* out_data = out->mutable_data(gpu_place); - int64_t* map_data = map->mutable_data(gpu_place); - float* loss_weight_data = loss_weight->mutable_data(gpu_place); - - int block_size = max_thread_num_per_block; - int ins_per_thread = (x2_lods_size + block_size - 1) / block_size; - dim3 block_dim(block_size); - dim3 grid_dim(1); - - Vector out_lods(x2_lods_size + 1, 0); - Vector map_lods(x2_lods_size + 1, 0); - - phi::MixVector mixv_out_lods(&out_lods); - phi::MixVector mixv_map_lods(&map_lods); - - // thrust::device_vector out_idx(1); - Vector out_idx(1, 0); - phi::MixVector mixv_out_idx(&out_idx); - - size_t* out_idx_data = mixv_out_idx.CUDAMutableData(gpu_place); - size_t* out_lods_data = mixv_out_lods.CUDAMutableData(gpu_place); - size_t* map_lods_data = mixv_map_lods.CUDAMutableData(gpu_place); - - float fill_value = 1.0; - - filter_copy_fuse_kernel<<>>( - x2_lods_size, - ins_per_thread, - x1_lods_data, - x2_lods_data, - x2_data, - x3_data, - x3->numel(), - out_data, - map_data, - map_lods_data, - out_lods_data, - out_idx_data, - x1_data, - x1_embed_size, - loss_weight_data, - fill_value); - - platform::GpuStreamSync(current_stream); - - mixv_out_lods.resize(mixv_out_idx[0]); - - if (mixv_out_lods.size() - 1 > 0) { - out->Resize(phi::make_ddim( - {(int64_t)mixv_out_lods.back(), (int64_t)x1_embed_size})); - - map->Resize(phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 3})); - loss_weight->Resize( - 
phi::make_ddim({(int64_t)mixv_out_lods.size() - 1, 1})); - - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - - if (mixv_out_lods.size() - 1 > 0) { - map_lods.resize(mixv_out_lods.size()); - - mixv_map_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - - mixv_out_lods.CopyToCPU(); - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - } else { - Vector map_lods(2, 0); - phi::MixVector mixv_map_lods(&map_lods); - thrust::device_ptr map_data_ptr(map_data); - - map_data_ptr[0] = 0; - map_data_ptr[1] = 1; - map_data_ptr[2] = 1; - - mixv_map_lods[0] = 0; - mixv_map_lods[1] = 1; - mixv_out_lods.push_back(1); - - mixv_map_lods.CopyToCPU(); - mixv_out_lods.CopyToCPU(); - - std::vector> map_lod_info; - map_lod_info.emplace_back(map_lods); - map->set_lod(map_lod_info); - - loss_weight->set_lod(map_lod_info); - - std::vector> out_lod_info; - out_lod_info.emplace_back(out_lods); - out->set_lod(out_lod_info); - - thrust::device_ptr out_data_ptr(out_data); - - // gpu kernel - if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else if (std::is_same::value) { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } else { - thrust::fill(out_data_ptr, - out_data_ptr + out->numel(), - static_cast(out_val_if_empty)); - } - - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - loss_weight_data_ptr[0] = 0; - } - -#endif - } -}; - -template -class FilterByInstagGradGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { -#if defined(PADDLE_WITH_CUDA) - - auto gpu_place = context.GetPlace(); - gpuStream_t current_stream = context.cuda_device_context().stream(); - auto max_thread_num_per_block = 1024; - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - - auto* mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - auto* loss_weight_data = loss_weight->data(); - - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(gpu_place); - thrust::device_ptr x1_grad_data_ptr(x1_grad_data); - thrust::device_ptr loss_weight_data_ptr(loss_weight_data); - - thrust::fill( - x1_grad_data_ptr, x1_grad_data_ptr + x1->dims()[0] * x1->dims()[1], 0); - - if (loss_weight->numel() != 1 || loss_weight_data_ptr[0] != 0) { - auto output_dims = output_grad->dims(); - int x1_embed_size = output_dims[1]; - - // one thread for multi-instances - int block_size = max_thread_num_per_block; - - size_t N = mmap->dims()[0]; - dim3 block_dim(block_size); - - dim3 grid_dim((N + block_size - 1) / block_size); - - const int ins_per_thread = 1; - - copy_grad_kernel<<>>( - N, - ins_per_thread, - output_grad_data, - x1_grad_data, - mmap_data, - x1_embed_size); - - cudaStreamSynchronize(current_stream); - } - -#endif - } -}; - 
-} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGPUKernel, - float, - double, - int32_t, - int64_t) {} - -PD_REGISTER_STRUCT_KERNEL(filter_by_instag_grad, - GPU, - ALL_LAYOUT, - ops::FilterByInstagGradGPUKernel, - float, - double, - int32_t, - int64_t) {} diff --git a/paddle/fluid/operators/filter_by_instag_op.h b/paddle/fluid/operators/filter_by_instag_op.h deleted file mode 100644 index 04dc713a4dcc9..0000000000000 --- a/paddle/fluid/operators/filter_by_instag_op.h +++ /dev/null @@ -1,231 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/mixed_vector.h" - -namespace paddle { -namespace operators { -using SelectedRows = phi::SelectedRows; - -template -using Vector = phi::Vector; - -template -class FilterByInstagKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - // X1 is global FC output - // Dim [batch size, embedding size] - auto* x1 = context.Input("Ins"); - bool is_x1_lod = context.Attr("is_lod"); - int64_t out_val_if_empty = context.Attr("out_val_if_empty"); - // X2 is ins tag list - // LoD [[0, Sum(ins1), Sum(ins1, ins2), ... ]] - auto* x2 = context.Input("Ins_tag"); - // X3 is local fc tag list - // LoD [[0, Sum(fc1), Sum(fc1, fc2) ...]] - auto* x3 = context.Input("Filter_tag"); - - std::unordered_set filter_tag; - auto* x3_data = x3->data(); - size_t len = x3->dims()[0]; - for (size_t i = 0; i < len; i++) { - filter_tag.insert(x3_data[i]); - } - - // expected auto = const int64_t - auto* x2_data = x2->data(); - // e.g get [0, 1, 2, 3, ...] 
- // size_t x2_lods_size = x2->dims()[0]; - // size_t instag_num_per_ins = x2->dims()[1]; - - Vector x2_lods(1, 0); - if (x2->lod().size() != 0) { // lod_level = 1 - x2_lods = x2->lod()[0]; - } else { // lod_level = 0 - const size_t x2_lods_size = x2->dims()[0]; - const size_t instag_num_per_ins = x2->dims()[1]; - for (size_t i = 0; i < x2_lods_size; i++) { - x2_lods.push_back(x2_lods.back() + instag_num_per_ins); - } - } - - Vector x1_lods(1, 0); - if (!is_x1_lod) { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } else { - // new: lod_level=0 => lod() return {} - if (x1->lod().size() != 0) { - x1_lods = x1->lod()[0]; - } else { - for (int i = 0; i < x1->dims()[0]; i++) { - x1_lods.push_back(i + 1); - } - } - } - std::unordered_map mmap_aux; - Vector out_lods(1, 0); - for (size_t i = 0; i < x2_lods.size() - 1; i++) { - for (size_t j = x2_lods[i]; j < x2_lods[i + 1]; j++) { - if (filter_tag.find(x2_data[j]) != filter_tag.end()) { - size_t batch_len = x1_lods[i + 1] - x1_lods[i]; - mmap_aux[out_lods.back()] = x1_lods[i]; - out_lods.push_back(out_lods.back() + batch_len); - break; - } - } - } - // set output value - // for those whose ins been dropout, set 0 for whole lines. - // otherwise, copy whole line - // Dim [local fc count, batch size, embedding size] - phi::DenseTensor* out = context.Output("Out"); - phi::DenseTensor* map = context.Output("IndexMap"); - phi::DenseTensor* loss_weight = - context.Output("LossWeight"); - // expected auto = const T - auto* x1_data = x1->data(); - // expected auto = T - size_t x1_embed_size = x1->dims()[1]; - if (out_lods.size() - 1 > 0) { - out->Resize( - phi::make_ddim({(int64_t)out_lods.back(), (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 3})); - loss_weight->Resize(phi::make_ddim({(int64_t)out_lods.size() - 1, 1})); - } else { - out->Resize(phi::make_ddim({1, (int64_t)x1_embed_size})); - map->Resize(phi::make_ddim({1, 3})); - loss_weight->Resize(phi::make_ddim({1, 1})); - } - auto* out_data = out->mutable_data(context.GetPlace()); - auto* map_data = map->mutable_data(context.GetPlace()); - auto* loss_weight_data = - loss_weight->mutable_data(context.GetPlace()); - if (out_lods.size() - 1 > 0) { - Vector map_lods; - for (size_t i = 0; i < out_lods.size() - 1; i++) { - map_data[i * 3] = (int64_t)out_lods[i]; - map_data[i * 3 + 1] = mmap_aux[map_data[i * 3]]; - map_data[i * 3 + 2] = out_lods[i + 1] - out_lods[i]; - map_lods.push_back(i); - } - map_lods.push_back(out_lods.size() - 1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - out->set_lod(out_lod_info); - memset(out_data, 0, out->numel() * sizeof(T)); - for (int i = 0; i < loss_weight->numel(); i++) { - loss_weight_data[i] = 1; - } - - for (size_t i = 0; i < out_lods.size() - 1; i++) { - size_t pos = out_lods[i]; - for (int k = map_data[i * 3 + 1]; - k < map_data[i * 3 + 1] + map_data[i * 3 + 2]; - k++) { - memcpy(out_data + pos * x1_embed_size, - x1_data + k * x1_embed_size, - x1_embed_size * sizeof(T)); - ++pos; - } - } - } else { - Vector map_lods; - map_data[0] = 0; - map_data[1] = 1; - map_data[2] = 1; - map_lods.push_back(0); - map_lods.push_back(1); - out_lods.push_back(1); - std::vector> map_lod_info; - map_lod_info.push_back(map_lods); - map->set_lod(map_lod_info); - loss_weight->set_lod(map_lod_info); - std::vector> out_lod_info; - out_lod_info.push_back(out_lods); - 
out->set_lod(out_lod_info); - for (int64_t oi = 0; oi < out->numel(); ++oi) { - if (std::is_same::value) { - out_data[oi] = (int32_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = (int64_t)out_val_if_empty; - } else if (std::is_same::value) { - out_data[oi] = static_cast(out_val_if_empty); - } else { - out_data[oi] = static_cast(out_val_if_empty); - } - } - loss_weight_data[0] = 0; - } - } -}; - -template -class FilterByInstagGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* output_grad = - context.Input(framework::GradVarName("Out")); - auto* x1_grad = - context.Output(framework::GradVarName("Ins")); - auto* loss_weight = context.Input("LossWeight"); - auto* mmap = context.Input("IndexMap"); - auto* x1 = context.Input("Ins"); - x1_grad->set_lod(context.Input("Ins")->lod()); - x1_grad->Resize(x1->dims()); - auto mmap_data = mmap->data(); - // expected auto = T - auto* output_grad_data = output_grad->data(); - - auto* loss_weight_data = loss_weight->data(); - // expected auto = T - auto* x1_grad_data = x1_grad->mutable_data(context.GetPlace()); - memset(x1_grad_data, 0, x1->dims()[0] * x1->dims()[1] * sizeof(T)); - if (loss_weight->numel() != 1 || loss_weight_data[0] != 0) { - auto output_dims = output_grad->dims(); - for (int i = 0; i < mmap->dims()[0]; i++) { - int src_ln = mmap_data[i * 3], dst_ln = mmap_data[i * 3 + 1]; - int line_cnt = mmap_data[i * 3 + 2]; - for (int l = 0; l < line_cnt; l++) { - for (int j = 0; j < output_dims[1]; j++) { - x1_grad_data[(dst_ln + l) * output_dims[1] + j] = - output_grad_data[(src_ln + l) * output_dims[1] + j]; - } - } - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/prroi_pool_op.cc b/paddle/fluid/operators/prroi_pool_op.cc deleted file mode 100644 index 0f0dbf3c6888a..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cc +++ /dev/null @@ -1,214 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -#include - -namespace paddle { -namespace operators { - -class PRROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor), " - "the input of PRROIPoolOp. " - "The format of input tensor is NCHW. Where N is the batch size, " - "C is the number of input channels, " - "H is the height of the input feature map, and " - "W is the width."); - AddInput("ROIs", - "(phi::DenseTensor), " - "ROIs (Regions of Interest) to pool over. " - "should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]. " - "where (x1, y1) is the top left coordinates, and " - "(x2, y2) is the bottom right coordinates. 
" - "The roi batch index can be calculated from LoD."); - AddInput("BatchRoINums", - "(Tensor), " - "1-D tensor with shape [N], the number of" - " rois for each image in batch, where N is the batch size") - .AsDispensable(); - AddOutput("Out", - "(Tensor), " - "the output of PRROIPoolOp is a 4-D Tensor with shape " - "(num_rois, output_channels, pooled_h, pooled_w)."); - AddAttr("spatial_scale", - "(float, default 1.0), " - "Multiplicative spatial scale factor " - "to translate ROI coords from their input scale " - "to the scale used when pooling.") - .SetDefault(1.0); - AddAttr("pooled_height", - "(int, default 1), " - "the pooled output height.") - .SetDefault(1); - AddAttr("pooled_width", - "(int, default 1), " - "the pooled output width.") - .SetDefault(1); - AddComment(R"Doc( -**PRROIPool Operator** - -Precise region of interest pooling (also known as PRROIPooling) is to perform - bilinear interpolation average pooling method for RoI Pooling. - -Please refer to https://arxiv.org/abs/1807.11590 for more details. - - )Doc"); - } -}; - -class PRROIPoolOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasInput("ROIs"), "Input", "ROIs", "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Input", "Out", "prroi_pool"); - - auto input_dims = ctx->GetInputDim("X"); - auto rois_dims = ctx->GetInputDim("ROIs"); - - PADDLE_ENFORCE_EQ(input_dims.size(), - 4, - platform::errors::InvalidArgument( - "The format of input tensor is NCHW")); - PADDLE_ENFORCE_EQ( - rois_dims.size(), - 2, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - PADDLE_ENFORCE_EQ( - rois_dims[1], - 4, - platform::errors::InvalidArgument( - "ROIs should be a 2-D phi::DenseTensor of shape (num_rois, 4) " - "given as [(x1, y1, x2, y2), ...]")); - int pooled_height = ctx->Attrs().Get("pooled_height"); - int pooled_width = ctx->Attrs().Get("pooled_width"); - float spatial_scale = ctx->Attrs().Get("spatial_scale"); - - PADDLE_ENFORCE_GT(pooled_height, - 0, - platform::errors::InvalidArgument( - "The pooled output height must be greater than 0")); - PADDLE_ENFORCE_GT(pooled_width, - 0, - platform::errors::InvalidArgument( - "The pooled output width must be greater than 0")); - PADDLE_ENFORCE_GT(spatial_scale, - 0.0f, - platform::errors::InvalidArgument( - "The spatial scale must greater than 0.")); - - auto out_dims = input_dims; - out_dims[0] = rois_dims[0]; - out_dims[1] = input_dims[1]; - out_dims[2] = pooled_height; - out_dims[3] = pooled_width; - - if (ctx->HasInput("BatchRoINums")) { - auto rois_batch_index = ctx->GetInputDim("BatchRoINums"); - PADDLE_ENFORCE_EQ(rois_batch_index[0], - input_dims[0], - platform::errors::InvalidArgument( - "The length of BatchRoINums should equal to " - "first dim of inputs(X)")); - } - ctx->SetOutputDim("Out", out_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class PRROIPoolGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - 
OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "prroi_pool"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), - "Output", - framework::GradVarName("X"), - "prroi_pool"); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - ctx->SetOutputDim(framework::GradVarName("ROIs"), ctx->GetInputDim("ROIs")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -template -class PRROIPoolGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("prroi_pool_grad"); - op->SetInput("X", this->Input("X")); - op->SetInput("Out", this->Output("Out")); - op->SetInput("ROIs", this->Input("ROIs")); - op->SetInput("BatchRoINums", this->Input("BatchRoINums")); - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("ROIs"), this->InputGrad("ROIs")); - op->SetAttrMap(this->Attrs()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(prroi_pool, - ops::PRROIPoolOp, - ops::PRROIPoolOpMaker, - ops::PRROIPoolGradMaker, - ops::PRROIPoolGradMaker); -REGISTER_OPERATOR(prroi_pool_grad, ops::PRROIPoolGradOp); - -PD_REGISTER_STRUCT_KERNEL(prroi_pool, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolOpKernel, - float, - double, - int, - int64_t) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - CPU, - ALL_LAYOUT, - ops::CPUPRROIPoolGradOpKernel, - float, - double, - int, - int64_t) {} diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu deleted file mode 100644 index 5d1243964279b..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ /dev/null @@ -1,439 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/prroi_pool_op.h" - -namespace paddle { -namespace operators { - -static constexpr int kNumCUDAThreads = 512; -static constexpr int kNumMaximumNumBlocks = 4096; - -static inline int NumBlocks(const int N) { - return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, - kNumMaximumNumBlocks); -} - -template -__global__ void GPUPRROIPoolForward(const int nthreads, - const T* input_data, - const T* input_rois, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* output_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int input_channel = c; - const T* offset_input_data = - input_data + - (roi_batch_id * input_channels + input_channel) * height * width; - - if (win_size > static_cast(0.0)) { - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - T sum_out = 0; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - output_data[i] = sum_out / win_size; - } else { - output_data[i] = 0.; - } - } -} - -template -__global__ void GPUPRROIPoolBackward(const int nthreads, - const T* in_data, - const T* input_rois, - const T* output_grad_data, - const float spatial_scale, - const int input_channels, - const int height, - const int width, - const int output_channels, - const int pooled_height, - const int pooled_width, - const int* rois_batch_id_data, - T* input_grad_data, - const T* out_data, - T* input_roi_grad_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / 
pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = max(static_cast(0.0), bin_size_w * bin_size_h); - int s_w = floor(win_start_w); - int e_w = ceil(win_end_w); - int s_h = floor(win_start_h); - int e_h = ceil(win_end_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) - : *offset_output_grad_data / win_size; - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - max(win_start_h, static_cast(h_iter)), - max(win_start_w, static_cast(w_iter)), - min(win_end_h, static_cast(h_iter) + static_cast(1.0)), - min(win_end_w, static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_out_data = out_data + i; - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } -} - -template -class GPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - // set rois batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, 
platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X) batch_size must be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, - rois_num_with_lod, - platform::errors::InvalidArgument( - "The rois_num from input and lod must be the same.")); - - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - int output_size = out->numel(); - int blocks = NumBlocks(output_size); - int threads = kNumCUDAThreads; - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - // call cuda kernel function - GPUPRROIPoolForward<<>>( - output_size, - in->data(), - rois->data(), - spatial_scale, - input_channels, - height, - width, - output_channels, - pooled_height, - pooled_width, - roi_id_data, - out->mutable_data(ctx.GetPlace())); - } -}; - -template -class GPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Input("Out"); - - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - int rois_num = rois->dims()[0]; - int input_channels = in->dims()[1]; - auto output_channels = input_channels; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (input_grad || input_roi_grad) { - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(platform::CPUPlace()); - - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - phi::DenseTensor batch_index_cpu; - framework::TensorCopySync( - *batchroinum, platform::CPUPlace(), &batch_index_cpu); - - int rois_batch_size = batchroinum->dims()[0]; - auto* batch_index = batch_index_cpu.data(); - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "the lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = 
rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.cuda_device_context(); - int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc( - dev_ctx.GetPlace(), - bytes, - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); - const auto gplace = ctx.GetPlace(); - memory::Copy(gplace, - roi_id_data, - cplace, - rois_batch_id_data, - bytes, - dev_ctx.stream()); - - input_grad->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant set_zero; - set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); - input_roi_grad->mutable_data(ctx.GetPlace()); - set_zero(ctx.cuda_device_context(), input_roi_grad, static_cast(0)); - - int output_grad_size = output_grad->numel(); - int blocks = NumBlocks(output_grad_size); - int threads = kNumCUDAThreads; - - if (output_grad_size > 0) { - GPUPRROIPoolBackward<<>>( - output_grad_size, - in->data(), - rois->data(), - output_grad->data(), - spatial_scale, - input_channels, - height, - width, - output_channels, - pooled_height, - pooled_width, - roi_id_data, - input_grad->mutable_data(ctx.GetPlace()), - out->data(), - input_roi_grad->mutable_data(ctx.GetPlace())); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL( - prroi_pool, GPU, ALL_LAYOUT, ops::GPUPRROIPoolOpKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL(prroi_pool_grad, - GPU, - ALL_LAYOUT, - ops::GPUPRROIPoolGradOpKernel, - float, - double) {} diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h deleted file mode 100644 index e2417a071ce88..0000000000000 --- a/paddle/fluid/operators/prroi_pool_op.h +++ /dev/null @@ -1,653 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" -#if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/phi/backends/gpu/gpu_primitives.h" -#endif - -namespace paddle { -namespace operators { - -template -inline HOSTDEVICE T PrRoIPoolingGetData(const T* data, - const int h, - const int w, - const int height, - const int width) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - T retVal = overflow ? 
0.0f : data[h * width + w]; - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingMatCalculation(const T* this_data, - const int s_h, - const int s_w, - const int e_h, - const int e_w, - const T y0, - const T x0, - const T y1, - const T x1, - const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - T sum_out = 0; - - alpha = x0 - static_cast(s_w); - beta = y0 - static_cast(s_h); - lim_alpha = x1 - static_cast(s_w); - lim_beta = y1 - static_cast(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, s_h, e_w, h0, w0) * tmp; - - alpha = x0 - static_cast(s_w); - beta = static_cast(e_h) - y1; - lim_alpha = x1 - static_cast(s_w); - lim_beta = static_cast(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, s_w, h0, w0) * tmp; - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - sum_out += PrRoIPoolingGetData(this_data, e_h, e_w, h0, w0) * tmp; - - return sum_out; -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) { - phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); - } -} -#else -template -inline HOSTDEVICE void PrRoIPoolingDistributeDiff(T* diff, - const T top_diff, - const int h, - const int w, - const int height, - const int width, - const T coeff) { - bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); - if (!overflow) { - *(diff + h * width + w) += top_diff * coeff; - } -} -#endif - -template -HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, - const T top_diff, - const int s_h, - const int s_w, - const int e_h, - const int e_w, - const T y0, - const T x0, - const T y1, - const T x1, - const int h0, - const int w0) { - T alpha, beta, lim_alpha, lim_beta, tmp; - - alpha = x0 - static_cast(s_w); - beta = y0 - static_cast(s_h); - lim_alpha = x1 - static_cast(s_w); - lim_beta = y1 - static_cast(s_h); - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, s_w, h0, w0, tmp); - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, s_h, e_w, h0, w0, tmp); - - alpha = x0 - static_cast(s_w); - beta = static_cast(e_h) - y1; - lim_alpha = x1 - static_cast(s_w); - lim_beta = static_cast(e_h) - y0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - 
alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, s_w, h0, w0, tmp); - - alpha = static_cast(e_w) - x1; - lim_alpha = static_cast(e_w) - x0; - tmp = (lim_alpha - 0.5f * lim_alpha * lim_alpha - alpha + - 0.5f * alpha * alpha) * - (lim_beta - 0.5f * lim_beta * lim_beta - beta + 0.5f * beta * beta); - PrRoIPoolingDistributeDiff(diff, top_diff, e_h, e_w, h0, w0, tmp); -} - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE void AccumulateRois(T* offset, T data) { - phi::CudaAtomicAdd(offset, data); -} -#else -template -inline HOSTDEVICE void AccumulateRois(T* offset, T data) { - *offset += data; -} -#endif - -#if defined(__NVCC__) || defined(__HIPCC__) -template -DEVICE T MaxFunctor(const T x, const T y) { - return max(x, y); -} -template -DEVICE T MinFunctor(const T x, const T y) { - return min(x, y); -} -#else -template -inline HOSTDEVICE T MaxFunctor(const T x, const T y) { - return std::max(x, y); -} -template -inline HOSTDEVICE T MinFunctor(const T x, const T y) { - return std::max(x, y); -} -#endif - -template -inline HOSTDEVICE static T PrRoIPoolingGetCoeff(T dh, T dw) { - dw = dw > 0 ? dw : -dw; - dh = dh > 0 ? dh : -dh; - return (1.0f - dh) * (1.0f - dw); -} - -template -inline HOSTDEVICE static T PrRoIPoolingInterpolation( - const T* data, const H h, const W w, const int height, const int width) { - T retVal = 0.0f; - int h1 = floorf(h); - int w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w); - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h); - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - h1 = floorf(h) + 1; - w1 = floorf(w) + 1; - retVal += - PrRoIPoolingGetData(data, h1, w1, height, width) * - PrRoIPoolingGetCoeff(h - static_cast(h1), w - static_cast(w1)); - return retVal; -} - -template -inline HOSTDEVICE T PrRoIPoolingSingleCoorIntegral(T s, T t, T c1, T c2) { - return 0.5f * (t * t - s * s) * c2 + - (t - 0.5f * t * t - s + 0.5f * s * s) * c1; -} - -template -inline HOSTDEVICE void PrRoIPoolingCoorBackward(int s_w, - int e_w, - int s_h, - int e_h, - int width, - int height, - T win_start_w, - T win_start_h, - T win_end_w, - T win_end_h, - int pw, - int ph, - const int pooled_width, - const int pooled_height, - T win_size, - const float spatial_scale, - const T* this_bottom_data, - const T* this_top_data, - T* this_data_grad, - const T* this_out_grad) { - T g_x1_y = 0.f; - T g_x2_y = 0.f; - T g_x_y1 = 0.f; - T g_x_y2 = 0.f; - - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - g_x1_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_start_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_start_w, height, width)); - - g_x2_y += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_h, static_cast(h_iter)) - h_iter, - MinFunctor(win_end_h, static_cast(h_iter + 1)) - h_iter, - PrRoIPoolingInterpolation( - this_bottom_data, h_iter, win_end_w, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, h_iter + 1, win_end_w, height, 
width)); - } - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - g_x_y1 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_start_h, w_iter + 1, height, width)); - - g_x_y2 += PrRoIPoolingSingleCoorIntegral( - MaxFunctor(win_start_w, static_cast(w_iter)) - w_iter, - MinFunctor(win_end_w, static_cast(w_iter + 1)) - w_iter, - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter, height, width), - PrRoIPoolingInterpolation( - this_bottom_data, win_end_h, w_iter + 1, height, width)); - } - - float partial_x1 = -g_x1_y + (win_end_h - win_start_h) * (*this_top_data); - float partial_y1 = -g_x_y1 + (win_end_w - win_start_w) * (*this_top_data); - float partial_x2 = g_x2_y - (win_end_h - win_start_h) * (*this_top_data); - float partial_y2 = g_x_y2 - (win_end_w - win_start_w) * (*this_top_data); - - partial_x1 = partial_x1 / win_size * spatial_scale; - partial_x2 = partial_x2 / win_size * spatial_scale; - partial_y1 = partial_y1 / win_size * spatial_scale; - partial_y2 = partial_y2 / win_size * spatial_scale; - - AccumulateRois( - this_data_grad + 0, - (partial_x1 * (1.0 - static_cast(pw) / pooled_width) + - partial_x2 * (1.0 - static_cast(pw + 1) / pooled_width)) * - (*this_out_grad)); - AccumulateRois( - this_data_grad + 1, - (partial_y1 * (1.0 - static_cast(ph) / pooled_height) + - partial_y2 * (1.0 - static_cast(ph + 1) / pooled_height)) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 2, - (partial_x2 * static_cast(pw + 1) / pooled_width + - partial_x1 * static_cast(pw) / pooled_width) * - (*this_out_grad)); - AccumulateRois(this_data_grad + 3, - (partial_y2 * static_cast(ph + 1) / pooled_height + - partial_y1 * static_cast(ph) / pooled_height) * - (*this_out_grad)); -} - -template -class CPUPRROIPoolOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto in_dims = in->dims(); - int batch_size = in_dims[0]; - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - if (rois_num == 0) return; - - auto in_stride = phi::stride(in_dims); - auto out_stride = phi::stride(out->dims()); - - const T* input_data = in->data(); - - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - PADDLE_ENFORCE_EQ(rois->lod().empty(), - false, - platform::errors::InvalidArgument( - "The lod of Input ROIs should not be empty when " - "BatchRoINums is None!")); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, 
- batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and input(X)'s " - "batch_size should be the same but received" - "rois_batch_size: %d and batch_size: %d", - rois_batch_size, - batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num_with_lod, - rois_num, - platform::errors::InvalidArgument("The rois_num from input should be " - "equal to the rois_num from lod, " - "but received rois_num from input: " - "%d and the rois_num from lod: %d.", - rois_num_with_lod, - rois_num)); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - T* output_data = out->mutable_data(ctx.GetPlace()); - const T* input_rois = rois->data(); - // calculate prroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - // calculate each pixel of the output feature map. 
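// In the nested loops that follow, each output element (n, c, ph, pw) is the
// exact integral of the bilinearly interpolated input feature over the bin
// window [win_start_h, win_end_h) x [win_start_w, win_end_w), accumulated
// cell by cell via PrRoIPoolingMatCalculation and then normalized by the bin
// area win_size; bins with zero area produce an output of 0.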
- int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - int out_plane_offset = out_roi_offset + c * out_stride[1]; - for (int ph = 0; ph < pooled_height; ++ph) { - int out_row_offset = out_plane_offset + ph * out_stride[2]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - T win_start_h = static_cast(ph) * bin_size_h + roi_start_h; - T win_start_w = static_cast(pw) * bin_size_w + roi_start_w; - T win_end_h = win_start_h + bin_size_h; - T win_end_w = win_start_w + bin_size_w; - // Add roi offsets and clip to input boundaries - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - int output_index = out_row_offset + pw; - int input_channel = c; - int input_plane_offset = - roi_batch_id * in_stride[0] + input_channel * in_stride[1]; - const T* offset_input_data = input_data + input_plane_offset; - T sum_out = 0.; - - if (win_size > static_cast(0.0)) { - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - sum_out += PrRoIPoolingMatCalculation( - offset_input_data, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - output_data[output_index] = sum_out / win_size; - } else { - output_data[output_index] = 0.; - } - } - } - } - } - } -}; - -template -class CPUPRROIPoolGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Input("Out"); - auto* rois = ctx.Input("ROIs"); - auto* output_grad = - ctx.Input(framework::GradVarName("Out")); - auto* input_grad = - ctx.Output(framework::GradVarName("X")); - auto* input_roi_grad = - ctx.Output(framework::GradVarName("ROIs")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - - if (input_grad || input_roi_grad) { - auto in_dims = in->dims(); - auto* in_data = in->data(); - auto* out_data = out->data(); - - int input_channels = in_dims[1]; - auto output_channels = input_channels; - int height = in_dims[2]; - int width = in_dims[3]; - int rois_num = rois->dims()[0]; - - // set roi batch id - phi::DenseTensor rois_batch_id_list; - rois_batch_id_list.Resize({rois_num}); - int* rois_batch_id_data = - rois_batch_id_list.mutable_data(ctx.GetPlace()); - if (ctx.HasInput("BatchRoINums") || rois->lod().empty()) { - auto* batchroinum = ctx.Input("BatchRoINums"); - auto* batch_index = batchroinum->data(); - int rois_batch_size = batchroinum->dims()[0]; - size_t c = 0; - for (int n = 0; n < rois_batch_size; ++n) { - for (int64_t k = 0; k < batch_index[n]; ++k) { - rois_batch_id_data[c] = n; - c = c + 1; - } - } - } else { - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; - } - } - } - - const T* input_rois = rois->data(); - const T* output_grad_data = output_grad->data(); - - input_grad->mutable_data(ctx.GetPlace()); - 
input_roi_grad->mutable_data(ctx.GetPlace()); - // set gradient of X to be 0. before backpropagate. - phi::funcs::SetConstant set_zero; - set_zero(ctx.template device_context(), - input_grad, - static_cast(0)); - set_zero(ctx.template device_context(), - input_roi_grad, - static_cast(0)); - - T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - T* input_roi_grad_data = input_roi_grad->mutable_data(ctx.GetPlace()); - - // backpropagate gradient per output pixel - int output_grad_size = output_grad->numel(); - for (int i = 0; i < output_grad_size; ++i) { - // The output is in order (n, c, ph, pw) - int pw = i % pooled_width; - int ph = (i / pooled_width) % pooled_height; - int c = (i / pooled_width / pooled_height) % output_channels; - int n = i / pooled_width / pooled_height / output_channels; - - // set roi_batch_id - int roi_batch_id = rois_batch_id_data[n]; - int input_channel = c; - int input_offset = - (roi_batch_id * input_channels + input_channel) * height * width; - T* offset_input_grad_data = input_grad_data + input_offset; - const T* offset_output_grad_data = output_grad_data + i; - const T* offset_out_data = out_data + i; - - // [start, end) interval for spatial sampling - const T* offset_input_rois = input_rois + n * 4; - T roi_start_w = static_cast(offset_input_rois[0]) * spatial_scale; - T roi_start_h = static_cast(offset_input_rois[1]) * spatial_scale; - T roi_end_w = static_cast(offset_input_rois[2]) * spatial_scale; - T roi_end_h = static_cast(offset_input_rois[3]) * spatial_scale; - T* offset_input_roi_grad_data = input_roi_grad_data + n * 4; - - T roi_width = std::max(roi_end_w - roi_start_w, static_cast(0.0)); - T roi_height = std::max(roi_end_h - roi_start_h, static_cast(0.0)); - - // Compute w and h at input feature map - T bin_size_h = roi_height / static_cast(pooled_height); - T bin_size_w = roi_width / static_cast(pooled_width); - - T win_start_w = roi_start_w + bin_size_w * pw; - T win_start_h = roi_start_h + bin_size_h * ph; - T win_end_w = win_start_w + bin_size_w; - T win_end_h = win_start_h + bin_size_h; - - T win_size = std::max(static_cast(0.0), bin_size_w * bin_size_h); - - T sum_out = win_size == static_cast(0.) - ? static_cast(0.) 
- : *offset_output_grad_data / win_size; - - int s_w = std::floor(win_start_w); - int e_w = std::ceil(win_end_w); - int s_h = std::floor(win_start_h); - int e_h = std::ceil(win_end_h); - - for (int w_iter = s_w; w_iter < e_w; ++w_iter) { - for (int h_iter = s_h; h_iter < e_h; ++h_iter) { - PrRoIPoolingMatDistributeDiff( - offset_input_grad_data, - sum_out, - h_iter, - w_iter, - h_iter + 1, - w_iter + 1, - std::max(win_start_h, static_cast(h_iter)), - std::max(win_start_w, static_cast(w_iter)), - std::min(win_end_h, - static_cast(h_iter) + static_cast(1.0)), - std::min(win_end_w, - static_cast(w_iter) + static_cast(1.0)), - height, - width); - } - } - - const T* offset_in_data = in_data + input_offset; - PrRoIPoolingCoorBackward(s_w, - e_w, - s_h, - e_h, - width, - height, - win_start_w, - win_start_h, - win_end_w, - win_end_h, - pw, - ph, - pooled_width, - pooled_height, - win_size, - spatial_scale, - offset_in_data, - offset_out_data, - offset_input_roi_grad_data, - offset_output_grad_data); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc deleted file mode 100644 index 62e805e323f84..0000000000000 --- a/paddle/fluid/operators/random_crop_op.cc +++ /dev/null @@ -1,107 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
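The prroi_pool kernels and helpers deleted above compute, for every output bin, the integral of the bilinearly interpolated feature map over the continuous bin window, divided by the bin area. The sketch below is only a rough illustration of that quantity, not the removed implementation: the helper names BilinearAt and PrRoIBin are made up here, and the integral is approximated by dense sampling instead of the closed-form per-cell weights used by PrRoIPoolingMatCalculation.

#include <cmath>
#include <cstdio>
#include <vector>

// Bilinear interpolation at a continuous coordinate (h, w); out-of-range taps
// read 0, matching the overflow handling of PrRoIPoolingGetData.
float BilinearAt(const std::vector<float>& data, int height, int width, float h, float w) {
  auto at = [&](int y, int x) -> float {
    return (y < 0 || x < 0 || y >= height || x >= width) ? 0.f : data[y * width + x];
  };
  const int h0 = static_cast<int>(std::floor(h));
  const int w0 = static_cast<int>(std::floor(w));
  const float dh = h - h0, dw = w - w0;
  return at(h0, w0) * (1 - dh) * (1 - dw) + at(h0, w0 + 1) * (1 - dh) * dw +
         at(h0 + 1, w0) * dh * (1 - dw) + at(h0 + 1, w0 + 1) * dh * dw;
}

// Average of the interpolated feature over [hs, he) x [ws, we), approximated on a grid.
float PrRoIBin(const std::vector<float>& data, int height, int width,
               float hs, float he, float ws, float we, int samples = 64) {
  if ((he - hs) <= 0.f || (we - ws) <= 0.f) return 0.f;  // empty bin -> 0, as in the removed kernel
  double sum = 0.0;
  for (int i = 0; i < samples; ++i)
    for (int j = 0; j < samples; ++j)
      sum += BilinearAt(data, height, width,
                        hs + (he - hs) * (i + 0.5f) / samples,
                        ws + (we - ws) * (j + 0.5f) / samples);
  return static_cast<float>(sum / (samples * samples));  // mean == integral / bin area
}

int main() {
  std::vector<float> feat(4 * 4, 1.f);  // 4x4 single-channel feature map of ones
  // Any bin fully inside a constant map averages to the constant value.
  std::printf("%f\n", PrRoIBin(feat, 4, 4, 0.5f, 2.5f, 0.5f, 2.5f));
  return 0;
}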
-#include "paddle/fluid/operators/random_crop_op.h" - -namespace paddle { -namespace operators { - -class RandomCropOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - auto shape = ctx->Attrs().Get>("shape"); - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GT( - x_dim.size(), - static_cast(shape.size()), - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the length of " - "Attr(shape)," - "But received dimensions of Input(X) is [%d], receivecd length" - "of Attr(shape) is [%d].", - x_dim.size(), - static_cast(shape.size()))); - auto out_dim = phi::vectorize(x_dim); - for (size_t i = 1; i <= shape.size(); ++i) { - size_t x_i = x_dim.size() - i; - size_t shape_i = shape.size() - i; - if (ctx->IsRuntime() || - (x_dim[static_cast(x_i)] > 0 && shape[shape_i] > 0)) { - PADDLE_ENFORCE_GE( - x_dim[x_i], - shape[shape_i], - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be larger than Attr(shape)," - "But received dimensions of Input(X) is [%d], received" - "size of Attr(shape) is [%d].", - x_dim[x_i], - shape[shape_i])); - } - out_dim[x_i] = shape[shape_i]; - } - ctx->SetOutputDim("Out", phi::make_ddim(out_dim)); - } - - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.GetPlace()); - } -}; - -class RandomCropOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "A batch of instances to random crop."); - AddInput("Seed", "The random seed."); - AddOutput("Out", "The cropped instance batch."); - AddOutput("SeedOut", "The random seed after random cropping.") - .AsIntermediate(); - AddAttr>("shape", "The shape of a cropped instance."); - AddAttr("startup_seed", - "If the input 'Seed' is not initialized, the 'startup_seed' " - "will be used to replace it. Even so, the seed after random " - "crop will also be outputed to the 'SeedOut'.") - .SetDefault(0); - AddComment(R"DOC( - This operator takes a batch of instance, and do random cropping on each instance. - It means that cropping positions differs on each instance, which is determined - by an uniform random generator. All cropped instances have the same shape, which - is determined by the operator's attribute 'shape'. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - random_crop, - ops::RandomCropOp, - ops::RandomCropOpMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); - -PD_REGISTER_STRUCT_KERNEL(random_crop, - CPU, - ALL_LAYOUT, - ops::RandomCropKernel, - float, - int, - double, - uint8_t, - int16_t) {} diff --git a/paddle/fluid/operators/random_crop_op.cu b/paddle/fluid/operators/random_crop_op.cu deleted file mode 100644 index 33182dff93fa4..0000000000000 --- a/paddle/fluid/operators/random_crop_op.cu +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/random_crop_op.h" - -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(random_crop, - GPU, - ALL_LAYOUT, - ops::RandomCropKernel, - float, - int, - double, - uint8_t, - int16_t) {} diff --git a/paddle/fluid/operators/random_crop_op.h b/paddle/fluid/operators/random_crop_op.h deleted file mode 100644 index fc625826b9a91..0000000000000 --- a/paddle/fluid/operators/random_crop_op.h +++ /dev/null @@ -1,232 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/for_range.h" -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#endif - -namespace paddle { -namespace operators { - -template -struct Random; - -template <> -struct Random { - using Engine = std::minstd_rand; - - template - using UniformIntDist = std::uniform_int_distribution; -}; - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template <> -struct Random { - using Engine = thrust::minstd_rand; - - template - using UniformIntDist = thrust::uniform_int_distribution; -}; -#endif - -template -HOSTDEVICE inline void StridedMemcpy(const T* x, - const size_t* x_dims, - T* out, - const size_t* out_dims, - int i, - int rank, - size_t prod_x_remain, - size_t prod_out_remain, - const size_t* offsets) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - size_t x_stride = prod_x_remain / x_dim_i; - size_t out_stride = prod_out_remain / out_dim_i; - size_t offset_i = offsets[i]; - - if (i == rank - 1) { - x += offset_i; - for (size_t j = 0; j < out_dim_i; ++j) { - *out++ = *x++; - } - } else { - x += offset_i * x_stride; - for (size_t j = 0; j < out_dim_i; ++j) { - StridedMemcpy( - x, x_dims, out, out_dims, i + 1, rank, x_stride, out_stride, offsets); - x += x_stride; - out += out_stride; - } - } -} - -template -struct RandomCropFunctor { - const T* x_; - T* out_; - size_t x_dims_[9]; - size_t out_dims_[9]; - int num_batchsize_dims_; - int rank_; - int64_t seed_; - - size_t prod_batchsize_dims_; - size_t prod_x_ins_dims_; - size_t prod_out_ins_dims_; - - RandomCropFunctor(const T* x, - T* out, - const framework::DDim& x_dims, - const framework::DDim& out_dims, - int num_batchsize_dims, - int64_t seed) - : x_(x), - out_(out), - num_batchsize_dims_(num_batchsize_dims), - rank_(x_dims.size()), - seed_(seed) { - PADDLE_ENFORCE_EQ( - x_dims.size(), - out_dims.size(), - platform::errors::InvalidArgument( - 
"The dimensions of Input(X) must equal to be the dimensions" - "of Output(Out), but received dimensions of Input(X) is [%d]," - "received dimensions of Output(Out) is [%d].", - x_dims.size(), - out_dims.size())); - PADDLE_ENFORCE_GT( - rank_, - num_batchsize_dims_, - platform::errors::InvalidArgument( - "The dimensions of Input(X) must be greater than the diff" - "value of Input(X)'s dimensions minus Atrr(shape)'s dimensions," - "But received Input(X)'s dimensions is [%d], received value of" - "Input(X)'s dimensions minus Attr(shape)'s dimensions is [%d].", - rank_, - num_batchsize_dims_)); - prod_batchsize_dims_ = 1; - prod_x_ins_dims_ = 1; - prod_out_ins_dims_ = 1; - for (size_t i = 0; i < static_cast(rank_); ++i) { - size_t x_dim_i = x_dims[i]; - size_t out_dim_i = out_dims[i]; - x_dims_[i] = x_dim_i; - out_dims_[i] = out_dim_i; - if (i < static_cast(num_batchsize_dims_)) { - PADDLE_ENFORCE_EQ( - x_dim_i, - out_dim_i, - platform::errors::InvalidArgument( - "The first [%d] dimension value of Input(X) and Output(Out)" - "must be equal, but received the [%d] dimension value of" - "Input(X) and Output(Out) respectively are [%d] and [%d].", - num_batchsize_dims_, - i, - x_dim_i, - out_dim_i)); - prod_batchsize_dims_ *= x_dim_i; - } else { - prod_x_ins_dims_ *= x_dim_i; - prod_out_ins_dims_ *= out_dim_i; - } - } - } - - HOSTDEVICE void operator()(size_t ins_idx) { - typename Random::Engine engine(seed_); - engine.discard(ins_idx * (rank_ - num_batchsize_dims_)); - size_t offsets[9] = {}; - for (int i = num_batchsize_dims_; i < rank_; ++i) { - typename Random::template UniformIntDist dist( - 0, x_dims_[i] - out_dims_[i]); - offsets[i - num_batchsize_dims_] = dist(engine); - } - - const T* x = x_ + ins_idx * prod_x_ins_dims_; - T* out = out_ + ins_idx * prod_out_ins_dims_; - - StridedMemcpy(x, - x_dims_ + num_batchsize_dims_, - out, - out_dims_ + num_batchsize_dims_, - 0, - rank_ - num_batchsize_dims_, - prod_x_ins_dims_, - prod_out_ins_dims_, - offsets); - } -}; - -template -class RandomCropKernel : public framework::OpKernel { - public: - virtual void Compute(const framework::ExecutionContext& ctx) const { - int64_t seed = 0; - auto& seed_tensor = GET_DATA_SAFELY( - ctx.Input("Seed"), "Input", "Seed", "RandomCrop"); - if (seed_tensor.IsInitialized()) { - if (platform::is_cpu_place(seed_tensor.place())) { - seed = *seed_tensor.template data(); - } else { - LOG(WARNING) << "It is slow to place seed in GPU memory. 
Please verify " - "your program"; - phi::DenseTensor cpu_seed; - framework::TensorCopySync(seed_tensor, platform::CPUPlace(), &cpu_seed); - seed = *cpu_seed.data(); - } - } else { - VLOG(5) << "WARNING: The input 'Seed' is not initialized, use attribute " - "'startup_seed' instead."; - seed = ctx.Attr("startup_seed"); - } - auto shape = ctx.Attr>("shape"); - auto& x = GET_DATA_SAFELY( - ctx.Input("X"), "Input", "X", "RandomCrop"); - auto& out = GET_DATA_SAFELY( - ctx.Output("Out"), "Output", "Out", "RandomCrop"); - - int num_batchsize_dims = x.dims().size() - shape.size(); - RandomCropFunctor functor( - x.template data(), - out.template mutable_data(ctx.GetPlace()), - x.dims(), - out.dims(), - num_batchsize_dims, - seed); - platform::ForRange for_range( - ctx.template device_context(), - functor.prod_batchsize_dims_); - - for_range(functor); - - Random::Engine engine(seed); - engine.discard(functor.prod_batchsize_dims_ * - (functor.rank_ - functor.num_batchsize_dims_)); - *ctx.Output("SeedOut")->mutable_data( - phi::make_ddim({1}), platform::CPUPlace()) = engine(); - } -}; - -// TODO(fengjiayi): Backward of random crop op - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc deleted file mode 100644 index da0c2e4a3cbb2..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ /dev/null @@ -1,242 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
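The random_crop files deleted above draw, for every instance in the batch, an independent offset in each cropped dimension from a uniform integer distribution over [0, x_dim - out_dim] and then copy the selected window (StridedMemcpy). A rough CPU-only sketch of that per-instance scheme, with made-up shapes and without the Seed/SeedOut bookkeeping:

#include <cstdio>
#include <random>
#include <vector>

int main() {
  const int batch = 2, in_h = 4, in_w = 4, out_h = 2, out_w = 2;
  std::vector<int> x(batch * in_h * in_w);
  for (int i = 0; i < static_cast<int>(x.size()); ++i) x[i] = i;
  std::vector<int> out(batch * out_h * out_w);

  const unsigned seed = 42;  // stands in for the 'Seed' input / 'startup_seed' attribute
  for (int n = 0; n < batch; ++n) {
    // One engine per instance, advanced so each instance draws its own offsets,
    // mirroring engine.discard(ins_idx * num_cropped_dims) in the removed functor.
    std::minstd_rand engine(seed);
    engine.discard(static_cast<unsigned long long>(n) * 2);  // 2 cropped dims: H and W
    std::uniform_int_distribution<int> dist_h(0, in_h - out_h);
    std::uniform_int_distribution<int> dist_w(0, in_w - out_w);
    const int off_h = dist_h(engine), off_w = dist_w(engine);

    for (int i = 0; i < out_h; ++i)
      for (int j = 0; j < out_w; ++j)
        out[(n * out_h + i) * out_w + j] =
            x[(n * in_h + off_h + i) * in_w + off_w + j];
    std::printf("instance %d cropped at offset (%d, %d)\n", n, off_h, off_w);
  }
  return 0;
}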
*/ - -#include "paddle/fluid/operators/space_to_depth_op.h" - -#include -#include -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SpaceToDepthOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - platform::errors::InvalidArgument( - "Input(X) of SpaceToDepthOp should not be null.")); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - platform::errors::InvalidArgument( - "Output(Out) of SpaceToDepthOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - platform::errors::InvalidArgument("input should be a 4D tensor")); - auto blocksize = ctx->Attrs().Get("blocksize"); - - PADDLE_ENFORCE_GT(blocksize, - 1, - platform::errors::InvalidArgument( - "The blocksize should be Greater than 1")); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be Greater than 0")); - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be Greater than 0")); - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be Greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize")); - PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } else { - if (x_dims[1] != -1) { - PADDLE_ENFORCE_GT(x_dims[1], - 0, - platform::errors::InvalidArgument( - "input channel should be Greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[1] % (blocksize * blocksize), - 0, - platform::errors::InvalidArgument( - "input channel should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - if (x_dims[2] != -1) { - PADDLE_ENFORCE_GT(x_dims[2], - 0, - platform::errors::InvalidArgument( - "input Height should be Greater than 0")); - PADDLE_ENFORCE_EQ( - x_dims[2] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Height should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - - if (x_dims[3] != -1) { - PADDLE_ENFORCE_GT(x_dims[3], - 0, - platform::errors::InvalidArgument( - "input Width should be Greater than 0")); - - PADDLE_ENFORCE_EQ( - x_dims[3] % (blocksize), - 0, - platform::errors::InvalidArgument( - "input Width should be divisible of the square of " - "SpaceToDepthOp blocksize")); - } - } - - VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims - << "Attribute blocksize" << blocksize << std::endl; - - std::vector output_shape(4, 0); // [B,C,H,W] - output_shape[0] = x_dims[0]; - output_shape[1] = x_dims[1] * blocksize * blocksize; - output_shape[2] = x_dims[2] / blocksize; - output_shape[3] = x_dims[3] / blocksize; - - auto out_dims = phi::make_ddim(output_shape); - - ctx->SetOutputDim("Out", out_dims); - - if (x_dims[0] == out_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. 
- ctx->ShareLoD("X", /*->*/ "Out"); - } - } -}; - -class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", - "(Tensor). The input should be a 4D tensor B * C * W * H of " - "SpaceToDepthOp " - "operator."); - AddOutput("Out", - "(Tensor), The output should be a 4D tensor B * C2 * W2 * H2 of " - "SpaceToDepthOp operator."); - AddAttr( - "blocksize", - "(int64_t, default 2) blocksize used to do change Space To Depth.") - .SetDefault(2) - .GreaterThan(1); - AddComment(R"DOC( - reorg operator used in Yolo v2. - The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize, - - Reshape Input(X) into the shape according to Attr(blocksize). The - data in Input(X) are unchanged. - - Examples: - - 1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X) - into a 4-D tensor with shape [128, 2048, 13, 13] and leaving Input(X)'s data unchanged. - - )DOC"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SpaceToDepthGradOpNoBufferVarsInferer, "X"); - -template -class SpaceToDepthGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("space_to_depth_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("X", this->Input("X")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SpaceToDepthGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE( - ctx->HasInput("X"), - platform::errors::InvalidArgument("Input(X) shouldn't be null.")); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - platform::errors::InvalidArgument( - "Input(Out@GRAD) shouldn't be null.")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OPERATOR(space_to_depth, - ops::SpaceToDepthOp, - ops::SpaceToDepthOpMaker, - ops::SpaceToDepthGradOpMaker, - ops::SpaceToDepthGradOpMaker); -REGISTER_OPERATOR(space_to_depth_grad, - ops::SpaceToDepthGradOp, - ops::SpaceToDepthGradOpNoBufferVarsInferer); -PD_REGISTER_STRUCT_KERNEL(space_to_depth, - CPU, - ALL_LAYOUT, - ops::SpaceToDepthKernel, - int, - int64_t, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad, - CPU, - ALL_LAYOUT, - ops::SpaceToDepthGradKernel, - int, - int64_t, - float, - double) {} diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu deleted file mode 100644 index 7f62509ee7d2e..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.cu +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
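The space_to_depth (reorg) operator deleted above rearranges spatial blocks of size blocksize x blocksize into the channel dimension. A small sketch of the output-shape rule as enforced by SpaceToDepthOp::InferShape above, with illustrative dimensions:

#include <array>
#include <cassert>
#include <cstdio>

std::array<long, 4> SpaceToDepthShape(const std::array<long, 4>& in, long blocksize) {
  assert(blocksize > 1);                                      // blocksize must be greater than 1
  assert(in[1] % (blocksize * blocksize) == 0);               // C divisible by blocksize^2
  assert(in[2] % blocksize == 0 && in[3] % blocksize == 0);   // H and W divisible by blocksize
  return {in[0], in[1] * blocksize * blocksize, in[2] / blocksize, in[3] / blocksize};
}

int main() {
  const auto out = SpaceToDepthShape({128, 2048, 26, 26}, 2);
  std::printf("[%ld, %ld, %ld, %ld]\n", out[0], out[1], out[2], out[3]);  // [128, 8192, 13, 13]
  return 0;
}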
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/space_to_depth_op.h" - -namespace plat = paddle::platform; -namespace ops = paddle::operators; - -PD_REGISTER_STRUCT_KERNEL(space_to_depth, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthKernel, - int, - int64_t, - float, - double) {} -PD_REGISTER_STRUCT_KERNEL(space_to_depth_grad, - GPU, - ALL_LAYOUT, - ops::SpaceToDepthGradKernel, - int, - int64_t, - float, - double) {} diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h deleted file mode 100644 index 18ff67c6132be..0000000000000 --- a/paddle/fluid/operators/space_to_depth_op.h +++ /dev/null @@ -1,131 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ -#endif // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -class space_to_depth_compute { - public: - HOSTDEVICE space_to_depth_compute(const T *x, - int64_t w, - int64_t h, - int64_t c, - int64_t batch, - int64_t blocksize, - int64_t forward, - T *out) - : x_(x), - w_(w), - h_(h), - c_(c), - batch_(batch), - blocksize_(blocksize), - forward_(forward), - out_(out) {} - - HOSTDEVICE void operator()(int64_t in_index) { - int64_t out_c = c_ / (blocksize_ * blocksize_); - // calculate each dim position with index of tensor - int64_t b = in_index / (c_ * h_ * w_); - int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_); - int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_; - int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_; - - int64_t c2 = k % out_c; - int64_t offset = k / out_c; - int64_t w2 = i * blocksize_ + offset % blocksize_; - int64_t h2 = j * blocksize_ + offset / blocksize_; - int64_t out_index = - w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b)); - if (forward_) - out_[out_index] = x_[in_index]; - else - out_[in_index] = x_[out_index]; - } - - private: - const T *x_; - int64_t w_, h_, c_, batch_, blocksize_, forward_; - T *out_; -}; - -template -class SpaceToDepthKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *out = context.Output("Out"); - auto *x = context.Input("X"); - auto blocksize = context.Attr("blocksize"); - auto in_dims = x->dims(); - out->mutable_data(context.GetPlace(), x->type()); - - auto out_dims = out->dims(); - auto B = in_dims[0]; - auto C = in_dims[1]; - 
auto H = in_dims[2]; - auto W = in_dims[3]; - platform::ForRange for_range( - context.template device_context(), - static_cast(x->numel())); - - auto *x_data = x->data(); - auto *out_data = out->data(); - paddle::operators::space_to_depth_compute computer( - x_data, W, H, C, B, blocksize, 1, out_data); - for_range(computer); - - out->Resize(out_dims); - } -}; - -template -class SpaceToDepthGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *d_out = - context.Input(framework::GradVarName("Out")); - auto *d_x = context.Output(framework::GradVarName("X")); - auto blocksize = context.Attr("blocksize"); - auto in_dims = d_x->dims(); - d_x->mutable_data(context.GetPlace(), d_out->type()); - - auto B = in_dims[0]; - auto C = in_dims[1]; - auto H = in_dims[2]; - auto W = in_dims[3]; - - platform::ForRange for_range( - context.template device_context(), - static_cast(d_x->numel())); - - auto *dx_data = d_x->data(); - auto *dout_data = d_out->data(); - - paddle::operators::space_to_depth_compute computer( - dout_data, W, H, C, B, blocksize, 0, dx_data); - for_range(computer); - - d_x->Resize(in_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc deleted file mode 100644 index 0f2f727dd9135..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cc +++ /dev/null @@ -1,232 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -#include - -#include "paddle/fluid/framework/no_need_buffer_vars_inference.h" - -namespace paddle { -namespace operators { - -class SquaredL2DistanceOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "SquaredL2DistanceOp"); - OP_INOUT_CHECK(ctx->HasOutput("sub_result"), - "Output", - "sub_result", - "SquaredL2DistanceOp"); - OP_INOUT_CHECK( - ctx->HasOutput("Out"), "Output", "Out", "SquaredL2DistanceOp"); - - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - PADDLE_ENFORCE_EQ(phi::arity(x_dims), - phi::arity(y_dims), - platform::errors::InvalidArgument( - "Input(X) and Input(X) of SquaredL2DistanceOp should " - "have same dimensions. " - "But received X's shape = [%s] and Y's shape = [%s], " - "the dimensions are %d and %d respectively", - x_dims, - y_dims, - phi::arity(x_dims), - phi::arity(y_dims))); - - int rank = phi::arity(x_dims); - PADDLE_ENFORCE_GE( - rank, - 2, - platform::errors::InvalidArgument( - "Input dimensions of SquaredL2DistanceOp should be at least 2." 
- "But received shape = [%s] and dimension is %d.", - x_dims, - rank)); - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(y_dims) <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - product(x_dims) / x_dims[0], - product(y_dims) / y_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Y) of SquaredL2DistanceOp should " - "have same dimensions." - "But received X's shape = [%s] and Y's shape = [%s]" - ", the products are %d and %d respectively", - x_dims, - y_dims, - product(x_dims) / x_dims[0], - product(y_dims) / y_dims[0])); - } - check = true; - if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) { - check = false; - } - if (check) { - PADDLE_ENFORCE_EQ( - y_dims[0] == 1 || y_dims[0] == x_dims[0], - true, - platform::errors::InvalidArgument( - "First dimension of Input(Y) of SquaredL2DistanceOp " - "must be equal to 1 or to first dimension of Input(X)." - "But received X's shape = [%s] and Y's shape = [%s]," - "the first dimensions are %d and %d respectively", - x_dims, - y_dims, - x_dims[0], - y_dims[0])); - } - ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); - ctx->SetOutputDim("Out", {x_dims[0], 1}); - ctx->ShareLoD("X", /*->*/ "Out"); - } -}; - -DECLARE_NO_NEED_BUFFER_VARS_INFERER(SquaredL2DistanceGradOpNoBufferVarsInferer, - "X", - "Y"); - -template -class SquaredL2DistanceGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("squared_l2_distance_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("sub_result", this->Output("sub_result")); - op->SetInput("X", this->Input("X")); - op->SetInput("Y", this->Input("Y")); - - op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); - op->SetOutput(framework::GradVarName("Y"), this->InputGrad("Y")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor) Input of SquaredL2DistanceOp."); - AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp."); - AddOutput("sub_result", - "(Tensor) Buffering subtraction result which " - "will be reused in backward.") - .AsIntermediate(); - AddOutput("Out", "(Tensor) Squared l2 distance between input and target."); - AddComment(R"DOC( -SquaredL2Distance operator - -This operator will cacluate the squared L2 distance for the input and -the target. Number of distance value will be equal to the first dimension -of input. First dimension of the target could be equal to the input or to 1. -If the first dimension of target is 1, the operator will broadcast target's -first dimension to input's first dimension. During backward propagation, -the user can decide whether to calculate the gradient of the input or -the target or both. - -Both the input X and Y can carry the LoD (Level of Details) information. -However, the output only shares the LoD information with input X. 
- )DOC"); - } -}; - -class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("sub_result"), - "Input", - "sub_result", - "SquaredL2DistanceGradOp"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - "Out@GRAD", - "SquaredL2DistanceGradOp"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ( - out_dims[0], - x_dims[0], - platform::errors::InvalidArgument( - "First dimension of output gradient and Input(X) " - "of SquaredL2DistanceGradOp must be equal " - "But received X's shape = [%s] and grad's shape = [%s], " - "the first dimensions are %d and %d respectively", - x_dims, - out_dims, - x_dims[0], - out_dims[0])); - PADDLE_ENFORCE_EQ(out_dims[1], - 1, - platform::errors::InvalidArgument( - "Second dimension of output gradient of " - "SquaredL2DistanceGradOp must be 1. " - "But received grad's shape = [%s], " - "with second dimension %d", - out_dims, - out_dims[1])); - } - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); - if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "sub_result"), - ctx.GetPlace()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR( - squared_l2_distance, - ops::SquaredL2DistanceOp, - ops::SquaredL2DistanceOpMaker, - ops::SquaredL2DistanceGradOpMaker, - ops::SquaredL2DistanceGradOpMaker); -REGISTER_OPERATOR(squared_l2_distance_grad, - ops::SquaredL2DistanceGradOp, - ops::SquaredL2DistanceGradOpNoBufferVarsInferer); - -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, CPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - CPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.cu b/paddle/fluid/operators/squared_l2_distance_op.cu deleted file mode 100644 index 4411df4d9ab7f..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/squared_l2_distance_op.h" - -namespace ops = paddle::operators; -PD_REGISTER_STRUCT_KERNEL( - squared_l2_distance, GPU, ALL_LAYOUT, ops::SquaredL2DistanceKernel, float) { -} -PD_REGISTER_STRUCT_KERNEL(squared_l2_distance_grad, - GPU, - ALL_LAYOUT, - ops::SquaredL2DistanceGradKernel, - float) {} diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h deleted file mode 100644 index 18039835c55c3..0000000000000 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SquaredL2DistanceKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("X"); - auto* in1 = context.Input("Y"); - auto* out0 = context.Output("sub_result"); - auto* out1 = context.Output("Out"); - - auto in0_dims = in0->dims(); - auto in1_dims = in1->dims(); - - int cols = in0->numel() / in0_dims[0]; - // reduce dimensions except the first - auto x = framework::EigenMatrix::From( - *in0, phi::make_ddim({in0_dims[0], cols})); - auto y = framework::EigenMatrix::From( - *in1, phi::make_ddim({in1_dims[0], cols})); - - out0->mutable_data(context.GetPlace()); - out1->mutable_data(context.GetPlace()); - auto sub_result = framework::EigenMatrix::From(*out0); - auto z = framework::EigenVector::Flatten(*out1); - - auto& place = - *context.template device_context().eigen_device(); - auto x_dims = x.dimensions(); - auto y_dims = y.dimensions(); - // buffer the substraction result - if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) { - sub_result.device(place) = - x - - y.broadcast(Eigen::array({{static_cast(x_dims[0]), 1}})); - } else { - sub_result.device(place) = x - y; - } - auto sub_res_pow2 = sub_result * sub_result; - z.device(place) = sub_res_pow2.sum(Eigen::array({{1}})); - } -}; - -template -class SquaredL2DistanceGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* in0 = context.Input("sub_result"); - auto* in1 = context.Input(framework::GradVarName("Out")); - auto* x_g = context.Output(framework::GradVarName("X")); - auto* y_g = context.Output(framework::GradVarName("Y")); - - PADDLE_ENFORCE_NOT_NULL( - x_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("X"))); - PADDLE_ENFORCE_NOT_NULL( - y_g, - platform::errors::NotFound( - "variable(%s) cannot be found " - "in scope for operator 'squared_l2_distance_grad'.", - framework::GradVarName("Y"))); - - auto sub_result = framework::EigenMatrix::From(*in0); - auto out_grad = framework::EigenMatrix::From(*in1); - - auto x_dims = x_g->dims(); 
- auto y_dims = y_g->dims(); - - int cols = x_g->numel() / x_dims[0]; - // calculate gradient - auto grad_mat = 2 * - (out_grad.broadcast(Eigen::array({{1, cols}}))) * - sub_result; - - // propagate back to input - auto& eigen_place = - *context.template device_context().eigen_device(); - - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = framework::EigenMatrix::From( - *x_g, phi::make_ddim({x_dims[0], cols})); - // dimensions are same with subResult - x_grad.device(eigen_place) = grad_mat; - - y_g->mutable_data(context.GetPlace()); - - PADDLE_ENFORCE_GE(sub_result.dimensions()[0], - y_dims[0], - platform::errors::InvalidArgument( - "First dimension of gradient must be greater or " - "equal than first dimension of target. But received " - "gradient dimension = %d and target dimension is %d.", - sub_result.dimensions()[0], - y_dims[0])); - - if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = framework::EigenMatrix::From( - *y_g, phi::make_ddim({y_dims[0], cols})); - y_grad.device(eigen_place) = -1 * grad_mat; - } else { - auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = framework::EigenVector::Flatten(*y_g); - y_grad.device(eigen_place) = col_sum_res; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc deleted file mode 100644 index 7265d966b9e2a..0000000000000 --- a/paddle/fluid/operators/tree_conv_op.cc +++ /dev/null @@ -1,240 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/tree_conv_op.h" - -#include -#include - -namespace paddle { -namespace operators { -class TreeConvOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("NodesVector", - "(Tensor) The feature vector of every node on the tree. " - "The shape of the feature vector must be " - "[max_tree_node_size, feature_size]."); - AddInput("EdgeSet", - "(Tensor) The Edges of Tree. The edge must be directional. " - "The shape of the edge set must be [max_tree_node_size, 2]."); - AddInput("Filter", - "(Tensor) The feature detector. " - "The shape of the filter is " - "[feature_size, 3, output_size, num_filters]."); - AddOutput("Out", - "(Tensor) The feature vector of subtrees. " - "The shape of the output tensor is [max_tree_node_size, " - "output_size, num_filters]. " - "The output tensor could be a new feature " - "vector for next tree convolution layers."); - AddAttr("max_depth", - "(int, default: 2) The depth of feature detector.") - .SetDefault(2) - .GreaterThan(1); - AddComment(R"DOC( -**Tree-Based Convolution Operator** - -Tree-Based Convolution is a kind of convolution based on tree structure. -Tree-Based Convolution is a part of Tree-Based Convolution Neural Network(TBCNN), -which is used to classify tree structures, such as Abstract Syntax Tree. 
-Tree-Based Convolution proposed a kind of data structure called continuous binary tree, -which regards multiway tree as binary tree. -The paper of Tree-Based Convolution Operator is here: -https://arxiv.org/abs/1409.5718v1 -)DOC"); - } -}; -class TreeConvOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "TreeConv"); - OP_INOUT_CHECK(ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "TreeConv"); - - auto edge_dims = ctx->GetInputDim("EdgeSet"); - auto vector_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } else { - if (edge_dims[2] != -1) { - PADDLE_ENFORCE_EQ(edge_dims[2], - 2, - platform::errors::InvalidArgument( - "Input(EdgeSet) dim[2] should be 2. " - "But received Input(EdgeSet) dim[2] is %d.", - edge_dims[2])); - } - } - PADDLE_ENFORCE_EQ(edge_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of EdgeSet Tensor should be 3. " - "But received the dimension of EdgeSet Tensor is %d.", - edge_dims.size())); - PADDLE_ENFORCE_EQ( - vector_dims.size(), - 3, - platform::errors::InvalidArgument( - "The dimension of NodesVector Tensor should be 3. " - "But received the dimension of NodesVector Tensor is %d.", - vector_dims.size())); - PADDLE_ENFORCE_EQ(filter_dims.size(), - 4, - platform::errors::InvalidArgument( - "The dimension of Filter Tensor should be 4. " - "But received the dimension of Filter Tensor is %d.", - filter_dims.size())); - - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. " - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } else { - if (filter_dims[1] != -1) { - PADDLE_ENFORCE_EQ(filter_dims[1], - 3, - platform::errors::InvalidArgument( - "Input(Filter) dim[1] should be 3. " - "But received Input(Filter) dim[1] is %d.", - filter_dims[1])); - } - - if (filter_dims[0] != -1 && vector_dims[2] != -1) { - PADDLE_ENFORCE_EQ( - filter_dims[0], - vector_dims[2], - platform::errors::InvalidArgument( - "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]. 
" - "But received Input(Filter) dim[0] = %d, Input(NodesVector) " - "dim[2] = %d.", - filter_dims[0], - vector_dims[2])); - } - } - auto output_dims = phi::make_ddim( - {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]}); - ctx->SetOutputDim("Out", output_dims); - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; - -template -class TreeConvGradOpMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr op) const override { - op->SetType("tree_conv_grad"); - - op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); - op->SetInput("Filter", this->Input("Filter")); - op->SetInput("EdgeSet", this->Input("EdgeSet")); - op->SetInput("NodesVector", this->Input("NodesVector")); - - op->SetOutput(framework::GradVarName("NodesVector"), - this->InputGrad("NodesVector")); - op->SetOutput(framework::GradVarName("Filter"), this->InputGrad("Filter")); - - op->SetAttrMap(this->Attrs()); - } -}; - -class TreeConvGradOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Filter"), "Input", "Filter", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("EdgeSet"), "Input", "EdgeSet", "grad_TreeConv"); - OP_INOUT_CHECK( - ctx->HasInput("NodesVector"), "Input", "NodesVector", "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("NodesVector")), - "Output", - framework::GradVarName("NodesVector"), - "grad_TreeConv"); - OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Filter")), - "Output", - framework::GradVarName("Filter"), - "grad_TreeConv"); - - auto vectors_dims = ctx->GetInputDim("NodesVector"); - auto filter_dims = ctx->GetInputDim("Filter"); - if (ctx->HasOutput(framework::GradVarName("Filter"))) { - ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims); - } - if (ctx->HasOutput(framework::GradVarName("NodesVector"))) { - ctx->SetOutputDim(framework::GradVarName("NodesVector"), vectors_dims); - } - } - - protected: - phi::KernelKey GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return phi::KernelKey( - OperatorWithKernel::IndicateVarDataType(ctx, "NodesVector"), - ctx.GetPlace()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(tree_conv, - ops::TreeConvOp, - ops::TreeConvOpMaker, - ops::TreeConvGradOpMaker, - ops::TreeConvGradOpMaker); - -REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp); - -PD_REGISTER_STRUCT_KERNEL( - tree_conv, CPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {} -PD_REGISTER_STRUCT_KERNEL( - tree_conv_grad, CPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {} diff --git a/paddle/fluid/operators/tree_conv_op.cu b/paddle/fluid/operators/tree_conv_op.cu deleted file mode 100644 index 1bfcb94013c2b..0000000000000 --- a/paddle/fluid/operators/tree_conv_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "paddle/fluid/operators/tree_conv_op.h"
-
-namespace ops = paddle::operators;
-
-PD_REGISTER_STRUCT_KERNEL(
-    tree_conv, GPU, ALL_LAYOUT, ops::TreeConvKernel, float, double) {}
-PD_REGISTER_STRUCT_KERNEL(
-    tree_conv_grad, GPU, ALL_LAYOUT, ops::TreeConvGradKernel, float, double) {}
diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h
deleted file mode 100644
index 18fd5bea29d30..0000000000000
--- a/paddle/fluid/operators/tree_conv_op.h
+++ /dev/null
@@ -1,148 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/tree2col.h"
-#include "paddle/phi/kernels/funcs/blas/blas.h"
-
-namespace paddle {
-namespace operators {
-using DDim = framework::DDim;
-template <typename T, typename DeviceContext>
-class TreeConvKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    math::Tree2ColFunctor<DeviceContext, T> tree2col;
-    phi::funcs::SetConstant<DeviceContext, T> constant;
-
-    auto *Edges = ctx.Input<phi::DenseTensor>("EdgeSet");
-    auto *Embeddings = ctx.Input<phi::DenseTensor>("NodesVector");
-    auto *Filter = ctx.Input<phi::DenseTensor>("Filter");
-    auto *output_emb = ctx.Output<phi::DenseTensor>("Out");
-    int max_depth = ctx.Attr<int>("max_depth");
-
-    auto &dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = phi::funcs::GetBlas<DeviceContext, T>(dev_ctx);
-
-    phi::DenseTensor W;
-    W.ShareDataWith(*Filter);
-    W.Resize(phi::flatten_to_2d(Filter->dims(), 2));
-
-    int batch_size = static_cast<int>(Edges->dims()[0]);
-    int n = static_cast<int>(Embeddings->dims()[1]);
-    int out_size = static_cast<int>(Filter->dims()[2]);
-    int num_filters = static_cast<int>(Filter->dims()[3]);
-    output_emb->mutable_data<T>({batch_size, n, out_size, num_filters},
-                                ctx.GetPlace());
-
-    auto edge_set_slicedim = phi::slice_ddim(
-        Edges->dims(), 1, static_cast<int>(Edges->dims().size()));
-
-    auto embedding_slicedim = phi::slice_ddim(
-        Embeddings->dims(), 1, static_cast<int>(Embeddings->dims().size()));
-
-    auto output_slicedim = phi::slice_ddim(
-        output_emb->dims(), 1, static_cast<int>(output_emb->dims().size()));
-
-    output_slicedim = phi::flatten_to_2d(output_slicedim, 1);
-
-    for (int idx = 0; idx < batch_size; idx++) {
-      auto edge_set = Edges->Slice(idx, idx + 1).Resize(edge_set_slicedim);
-      auto embeddings =
-          Embeddings->Slice(idx, idx + 1).Resize(embedding_slicedim);
-      auto out_vec = output_emb->Slice(idx, idx + 1).Resize(output_slicedim);
-
phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - constant(dev_ctx, &out_vec, 0); - blas.MatMul(patch, W, &out_vec); - } - } -}; -template -class TreeConvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *out_g = ctx.Input(framework::GradVarName("Out")); - auto *in_g = - ctx.Output(framework::GradVarName("NodesVector")); - auto *filter_g = - ctx.Output(framework::GradVarName("Filter")); - int max_depth = ctx.Attr("max_depth"); - auto *Embeddings = ctx.Input("NodesVector"); - auto *edges = ctx.Input("EdgeSet"); - auto *Filter = ctx.Input("Filter"); - math::Tree2ColFunctor tree2col; - math::Col2TreeFunctor col2tree; - phi::funcs::SetConstant constant; - auto &dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - phi::DenseTensor W; - W.ShareDataWith(*Filter); - W.Resize(phi::flatten_to_2d(Filter->dims(), 1)); - - int batch_size = static_cast(Embeddings->dims()[0]); - - auto edge_set_slicedim = phi::slice_ddim( - edges->dims(), 1, static_cast(edges->dims().size())); - - auto embedding_slicedim = phi::slice_ddim( - Embeddings->dims(), 1, static_cast(Embeddings->dims().size())); - - auto out_grad_dims = phi::slice_ddim( - out_g->dims(), 1, static_cast(out_g->dims().size())); - out_grad_dims = phi::flatten_to_2d(out_grad_dims, 1); - if (filter_g) { - filter_g->mutable_data(Filter->dims(), ctx.GetPlace()); - phi::DenseTensor f_g; - f_g.ShareDataWith(*filter_g); - f_g.Resize(phi::flatten_to_2d(Filter->dims(), 2)); - constant(dev_ctx, filter_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto embeddings = Embeddings->Slice(batch_id, batch_id + 1) - .Resize(embedding_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - phi::DenseTensor patch; - tree2col(dev_ctx, edge_set, embeddings, &patch, max_depth); - blas.MatMul(patch, true, out_grad, false, T(1.0), &f_g, T(1.0)); - } - } - if (in_g) { - auto input_grad_dims = phi::slice_ddim( - in_g->dims(), 1, static_cast(in_g->dims().size())); - in_g->mutable_data(Embeddings->dims(), ctx.GetPlace()); - constant(dev_ctx, in_g, 0); - for (int batch_id = 0; batch_id < batch_size; batch_id++) { - auto edge_set = - edges->Slice(batch_id, batch_id + 1).Resize(edge_set_slicedim); - auto out_grad = - out_g->Slice(batch_id, batch_id + 1).Resize(out_grad_dims); - auto in_grad = - in_g->Slice(batch_id, batch_id + 1).Resize(input_grad_dims); - phi::DenseTensor in_grad_temp; - col2tree(dev_ctx, edge_set, out_grad, &in_grad_temp, max_depth); - blas.MatMul(in_grad_temp, false, W, true, &in_grad); - } - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 3cfb53ea14b65..1b9b91e3efc41 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -97,8 +97,7 @@ register_unity_group( fill_constant_batch_size_like_op.cc fill_constant_op.cc fill_op.cc - fill_zeros_like_op.cc - filter_by_instag_op.cc) + fill_zeros_like_op.cc) register_unity_group( cc flatten_op.cc @@ -203,7 +202,6 @@ register_unity_group( positive_negative_pair_op.cc prelu_op.cc print_op.cc - prroi_pool_op.cc psroi_pool_op.cc pull_box_extended_sparse_op.cc pull_box_sparse_op.cc @@ -215,7 +213,6 @@ register_unity_group( quantize_op.cc 
mkldnn/quantize_mkldnn_op.cc queue_generator_op.cc - random_crop_op.cc range_op.cc rank_attention_op.cc rank_loss_op.cc @@ -258,7 +255,6 @@ register_unity_group( slice_op.cc) register_unity_group( cc - space_to_depth_op.cc spectral_norm_op.cc split_lod_tensor_op.cc split_op.cc @@ -283,7 +279,6 @@ register_unity_group( trace_op.cc transpose_op.cc mkldnn/transpose_mkldnn_op.cc - tree_conv_op.cc unbind_op.cc unfold_op.cc) register_unity_group( @@ -328,7 +323,7 @@ register_unity_group( unbind_op.cu.cc unpool_op.cu.cc unsqueeze_op.cu.cc) -register_unity_group(cc arg_max_op.cc arg_min_op.cc squared_l2_distance_op.cc) +register_unity_group(cc arg_max_op.cc arg_min_op.cc) register_unity_group( cc linear_chain_crf_op.cc @@ -441,8 +436,7 @@ register_unity_group( masked_select_op.cu lstmp_op.cu shuffle_channel_op.cu - softmax_cudnn_op.cu - squared_l2_distance_op.cu) + softmax_cudnn_op.cu) register_unity_group( cu conv_shift_op.cu @@ -474,18 +468,11 @@ register_unity_group( partial_sum_op.cu pixel_shuffle_op.cu prelu_op.cu - prroi_pool_op.cu run_program_op.cu pull_box_extended_sparse_op.cu pull_box_sparse_op.cu) -register_unity_group( - cu - random_crop_op.cu - range_op.cu - reverse_op.cu - partial_concat_op.cu - kldiv_loss_op.cu - instance_norm_op.cu) +register_unity_group(cu range_op.cu reverse_op.cu partial_concat_op.cu + kldiv_loss_op.cu instance_norm_op.cu) register_unity_group( cu roi_align_op.cu @@ -509,7 +496,6 @@ register_unity_group( slice_op.cu) register_unity_group( cu - space_to_depth_op.cu spectral_norm_op.cu split_op.cu split_selected_rows_op.cu @@ -517,14 +503,8 @@ register_unity_group( sum_op.cu temporal_shift_op.cu arg_max_op.cu) -register_unity_group( - cu - row_conv_op.cu - tree_conv_op.cu - tril_triu_op.cu - unfold_op.cu - arg_min_op.cu - crop_tensor_op.cu) +register_unity_group(cu row_conv_op.cu tril_triu_op.cu unfold_op.cu + arg_min_op.cu crop_tensor_op.cu) register_unity_group( cu smooth_l1_loss_op.cu diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt index 7aa25386076e5..6aa04cf4a6896 100644 --- a/test/legacy_test/CMakeLists.txt +++ b/test/legacy_test/CMakeLists.txt @@ -523,9 +523,7 @@ set(TEST_OPS_WITH_GC test_mean_op test_pad2d_op test_scatter_op - test_slice_op - test_space_to_depth_op - test_squared_l2_distance_op) + test_slice_op) foreach(TEST_OP ${TEST_OPS_WITH_GC}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) @@ -1047,7 +1045,6 @@ set_tests_properties(test_sigmoid_cross_entropy_with_logits_op set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 150) set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) set_tests_properties(test_cond PROPERTIES TIMEOUT 120) -set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200) set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) diff --git a/test/legacy_test/test_filter_by_instag_op.py b/test/legacy_test/test_filter_by_instag_op.py deleted file mode 100644 index 211889feaa06b..0000000000000 --- a/test/legacy_test/test_filter_by_instag_op.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""This is unit test of Test filter_instag Op.""" - -import unittest - -import numpy as np -from op_test import OpTest - -"""This is Test Case 1""" - - -class TestFilterByInstagOp(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - x1 = np.zeros((36, 4), dtype=np.float64) - for i in range(36): - for j in range(4): - x1[i, j] = i - x1_lod = [[1, 2, 3, 4, 5, 6, 7, 8]] - - x2 = np.array([[1], [2], [1], [2], [1], [2], [1], [2]]).astype('int64') - x2_lod = [[1, 1, 1, 1, 1, 1, 1, 1]] - - x3 = np.array([2]).astype('int64') - - out = np.zeros((20, 4), dtype=np.float64) - out_lod = [[2, 4, 6, 8]] - start_num_lst = [1, 6, 15, 28] - - ln = 0 - for i in range(4): - start = start_num_lst[i] - len = out_lod[0][i] - for j in range(len): - cur = start + j - for k in range(4): - out[ln, k] = cur - ln += 1 - - mmap = np.array([[0, 1, 2], [2, 6, 4], [6, 15, 6], [12, 28, 8]]).astype( - 'int64' - ) - mmap_lod = [[1, 1, 1, 1]] - - loss_weight = np.array([[1], [1], [1], [1]]).astype('double') - - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 2""" - - -class TestFilterByInstagOp2(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([1]).astype('int64') - - out = np.zeros([2, 36]).astype('double') - out[0] = x1[1] - out[1] = x1[3] - out_lod = [[1, 1]] - - mmap = np.array([[0, 1, 1], [1, 3, 1]]).astype('int64') - mmap_lod = [[1, 1]] - - loss_weight = np.array([[1], [1]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 3""" - - -class TestFilterByInstagOp3(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - x1_lod = [[1, 1, 1, 1]] - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': (x1, x1_lod), - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, 
mmap_lod), - } - self.attrs = {'is_lod': True, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -"""This is Test Case 4""" - - -class TestFilterByInstagOp4(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('double') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Ins'], 'Out', no_grad_set={'Ins_tag', 'Filter_tag'}) - - -class TestFilterByInstagOp6(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int64') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -class TestFilterByInstagOp7(OpTest): - def setUp(self): - self.op_type = 'filter_by_instag' - - x1 = np.random.random((4, 36)).astype('int32') - - x2 = np.array([[2], [1], [2], [1]]).astype('int64') - x2_lod = [[1, 1, 1, 1]] - - x3 = np.array([3]).astype('int64') - - out = np.zeros((1, 36)).astype('double') - out_lod = [[1]] - - mmap = np.array([[0, 1, 1]]).astype('int64') - mmap_lod = [[1]] - - loss_weight = np.array([[0]]).astype('double') - self.inputs = { - 'Ins': x1, - 'Ins_tag': (x2, x2_lod), - 'Filter_tag': x3, - } - self.outputs = { - 'Out': (out, out_lod), - 'LossWeight': (loss_weight, mmap_lod), - 'IndexMap': (mmap, mmap_lod), - } - self.attrs = {'is_lod': False, 'out_val_if_empty': 0} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - pass - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_random_crop_op.py b/test/legacy_test/test_random_crop_op.py deleted file mode 100644 index 08355378207c1..0000000000000 --- a/test/legacy_test/test_random_crop_op.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
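For reference, the deleted filter_by_instag tests pin the expected outputs down numerically. The following is a minimal numpy restatement of Test Case 2 above (an illustration with hypothetical variable names, not code from the repository): rows of Ins whose tag appears in Filter_tag are kept, IndexMap records (output offset, input offset, length), and LossWeight is 1 for every surviving row.

    # Hypothetical numpy restatement of Test Case 2: tags [2, 1, 2, 1], filter [1].
    import numpy as np

    x1 = np.random.random((4, 36)).astype('double')      # Ins, one row per instance
    ins_tag = np.array([2, 1, 2, 1])                      # Ins_tag
    filter_tag = np.array([1])                            # Filter_tag

    kept = np.flatnonzero(np.isin(ins_tag, filter_tag))   # rows 1 and 3 survive
    out = x1[kept]                                        # expected 'Out'
    index_map = np.stack(
        [np.arange(len(kept)), kept, np.ones_like(kept)], axis=1)
    loss_weight = np.ones((len(kept), 1))

    assert (index_map == np.array([[0, 1, 1], [1, 3, 1]])).all()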
-# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestRandomCropOp(OpTest): - def setUp(self): - to_crop = np.array( - [[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]]] * 5 - ).astype(np.int32) - self.possible_res = [ - np.array([[1, 2, 3], [5, 6, 7]]).astype(np.int32), - np.array([[2, 3, 4], [6, 7, 8]]).astype(np.int32), - np.array([[5, 6, 7], [9, 10, 11]]).astype(np.int32), - np.array([[6, 7, 8], [10, 11, 12]]).astype(np.int32), - ] - self.op_type = "random_crop" - self.inputs = {'X': to_crop, 'Seed': np.array([10]).astype('int64')} - self.outputs = {'Out': np.array([]), 'SeedOut': np.array([])} - self.attrs = {'shape': [2, 3]} - - def test_check_output(self): - self.check_output_customized(self.verify_output) - - def verify_output(self, outs): - out = np.array(outs[1]) - for ins in out[:]: - is_equal = [(ins == res).all() for res in self.possible_res] - self.assertIn(True, is_equal) - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_space_to_depth_op.py b/test/legacy_test/test_space_to_depth_op.py deleted file mode 100644 index c7cd6cae179db..0000000000000 --- a/test/legacy_test/test_space_to_depth_op.py +++ /dev/null @@ -1,152 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
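For reference, the deleted random_crop test only asserts that every cropped instance matches one of the enumerated 2x3 windows of the 3x4 input. A minimal sketch of that membership check, assuming a numpy Generator as a stand-in for the operator's Seed input (illustrative only, not the operator implementation):

    # Mirror of TestRandomCropOp.verify_output: a 2x3 crop of the 3x4 image must
    # equal one of the four possible windows listed in possible_res.
    import numpy as np

    image = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]], dtype=np.int32)
    crop_h, crop_w = 2, 3
    windows = [image[i:i + crop_h, j:j + crop_w]
               for i in range(image.shape[0] - crop_h + 1)
               for j in range(image.shape[1] - crop_w + 1)]

    rng = np.random.default_rng(10)            # stand-in for the op's Seed input
    i = rng.integers(0, image.shape[0] - crop_h + 1)
    j = rng.integers(0, image.shape[1] - crop_w + 1)
    crop = image[i:i + crop_h, j:j + crop_w]
    assert any((crop == w).all() for w in windows)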
- -import unittest - -import numpy as np -from op_test import OpTest - -from paddle import base - - -class TestSpaceToDepthOp(OpTest): - @staticmethod - def helper(in_, width, height, channel, batch, blocksize, forward, out_): - channel_out = channel // (blocksize * blocksize) - for b in range(batch): - for k in range(channel): - for j in range(height): - for i in range(width): - in_index = i + width * (j + height * (k + channel * b)) - channel2 = k % channel_out - offset = k // channel_out - width2 = i * blocksize + offset % blocksize - height2 = j * blocksize + offset // blocksize - out_index = width2 + width * blocksize * ( - height2 - + height * blocksize * (channel2 + channel_out * b) - ) - if forward: - out_[out_index] = in_[in_index] - else: - out_[in_index] = in_[out_index] - - def setUp(self): - self.init_data() - - self.op_type = "space_to_depth" - self.inputs = {"X": self.x} - self.helper( - self.x_1d, - self.x.shape[3], - self.x.shape[2], - self.x.shape[1], - self.x.shape[0], - self.blocksize, - self.forward, - self.out_1d, - ) - self.out = np.reshape(self.out_1d, self.infered_shape) - self.attrs = {"blocksize": self.blocksize} - self.outputs = {"Out": self.out} - - def init_data(self): - self.ori_shape = (32, 12, 6, 6) - self.infered_shape = (32, 48, 3, 3) - self.one_d_len = 32 * 48 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - def test_check_output(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_output_with_place( - place=place, atol=1e-5, no_check_set=None, equal_nan=False - ) - - def test_check_grad(self): - place = ( - base.core.CUDAPlace(0) - if base.core.is_compiled_with_cuda() - else base.core.CPUPlace() - ) - self.check_grad_with_place(place, ['X'], 'Out') - - -class TestSpaceToDepthOpBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 8, 6, 6) - self.infered_shape = (32, 32, 3, 3) - self.one_d_len = 32 * 32 * 3 * 3 - - self.blocksize = 2 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 6, 6) - self.infered_shape = (32, 81, 2, 2) - self.one_d_len = 32 * 81 * 2 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp): - def init_data(self): - self.ori_shape = (32, 9, 9, 6) - self.infered_shape = (32, 81, 3, 2) - 
self.one_d_len = 32 * 81 * 3 * 2 - - self.blocksize = 3 - self.x = np.random.random(self.ori_shape).astype('float64') - self.x_1d = np.reshape(self.x, self.one_d_len) - self.out = np.zeros(self.infered_shape).astype('float64') - self.out_1d = np.reshape(self.out, self.one_d_len) - self.forward = 1 - - -if __name__ == '__main__': - unittest.main() diff --git a/test/legacy_test/test_squared_l2_distance_op.py b/test/legacy_test/test_squared_l2_distance_op.py deleted file mode 100644 index 579681ab0c098..0000000000000 --- a/test/legacy_test/test_squared_l2_distance_op.py +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -class TestSquaredL2DistanceOp_f0(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (5, 20)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f1(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -class TestSquaredL2DistanceOp_f2(OpTest): - def setUp(self): - self.op_type = "squared_l2_distance" - self.inputs = { - 'X': np.random.uniform(0.1, 0.6, (2, 3, 4)).astype("float32"), - 'Y': np.random.uniform(0.1, 0.6, (1, 3, 4)).astype("float32"), - } - sub_res = self.inputs['X'] - self.inputs['Y'] - sub_res = sub_res.reshape((2, 3 * 4)) - output = sub_res * sub_res - self.outputs = { - 'sub_result': sub_res, - 'Out': np.expand_dims(output.sum(1), 1), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X', 'Y'], 'Out') - - -if __name__ == "__main__": - unittest.main() diff --git a/test/legacy_test/test_tree_conv_op.py b/test/legacy_test/test_tree_conv_op.py deleted file mode 100644 index e05ee1a4d4cdf..0000000000000 --- a/test/legacy_test/test_tree_conv_op.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
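For reference, the deleted squared_l2_distance tests and the gradient kernel removed earlier in this patch form a simple forward/backward pair. A minimal numpy sketch of both, assuming the broadcast case used in the f1/f2 tests (illustrative only, not repository code):

    # Forward, as in the deleted tests: sub_result = X - Y, Out[i] = sum_j sub[i, j]**2.
    # Backward, as in the deleted grad kernel: dX = 2 * dOut * sub_result, and dY is
    # either -dX row-for-row, or the column sum of -dX when Y was broadcast over rows.
    import numpy as np

    X = np.random.uniform(0.1, 0.6, (5, 20)).astype('float32')
    Y = np.random.uniform(0.1, 0.6, (1, 20)).astype('float32')   # broadcast case

    sub = X - Y
    out = (sub * sub).sum(axis=1, keepdims=True)

    d_out = np.ones_like(out)                  # upstream gradient
    d_x = 2.0 * d_out * sub
    d_y = -d_x if Y.shape[0] == X.shape[0] else -d_x.sum(axis=0, keepdims=True)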
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np -from op_test import OpTest - - -def collect_node_patch(og, max_depth): - """ - The naive method to construct patches - :param og: original graph - :param max_depth: the depth of convolution filters - :return: convolution patches - """ - - def gen(node, max_depth): - collected = [(node, 1, 1, 0, max_depth)] - - def recurse_helper(node, depth): - if depth > max_depth: - return - l = len(og[node]) - for idx, c in enumerate(og[node], 1): - if depth + 1 < max_depth: - collected.append((c, idx, l, depth + 1, max_depth)) - recurse_helper(c, depth + 1) - - recurse_helper(node, 0) - return collected - - res = [] - for u in range(1, len(og)): - lis = gen(u, max_depth) - if len(lis) > 0: - res.append(lis) - return res - - -class TestTreeConvOp(OpTest): - def setUp(self): - self.n = 17 - self.fea_size = 3 - self.output_size = 1 - self.max_depth = 2 - self.batch_size = 2 - self.num_filters = 1 - adj_array = [ - 1, - 2, - 1, - 3, - 1, - 4, - 1, - 5, - 2, - 6, - 2, - 7, - 2, - 8, - 4, - 9, - 4, - 10, - 5, - 11, - 6, - 12, - 6, - 13, - 9, - 14, - 9, - 15, - 9, - 16, - 9, - 17, - ] - adj = np.array(adj_array).reshape((1, self.n - 1, 2)).astype('int32') - adj = np.tile(adj, (self.batch_size, 1, 1)) - self.op_type = 'tree_conv' - vectors = np.random.random( - (self.batch_size, self.n, self.fea_size) - ).astype('float64') - self.inputs = { - 'EdgeSet': adj, - 'NodesVector': vectors, - 'Filter': np.random.random( - (self.fea_size, 3, self.output_size, self.num_filters) - ).astype('float64'), - } - self.attrs = {'max_depth': self.max_depth} - vectors = [] - for i in range(self.batch_size): - vector = self.get_output_naive(i) - vectors.append(vector) - self.outputs = { - 'Out': np.array(vectors).astype('float64'), - } - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad( - ['NodesVector', 'Filter'], 'Out', max_relative_error=0.5 - ) - - def get_output_naive(self, batch_id): - og = [[] for i in range(1, self.n + 2)] - st = np.array(self.inputs['EdgeSet'][batch_id]).tolist() - for e in st: - og[e[0]].append(e[1]) - patches = collect_node_patch(og, self.max_depth) - W = np.array(self.inputs['Filter']).astype('float64') - W = np.transpose(W, axes=[1, 0, 2, 3]) - vec = [] - for i, patch in enumerate(patches, 1): - result = np.zeros((1, W.shape[2], W.shape[3])) - for v in patch: - eta_t = float(v[4] - v[3]) / float(v[4]) - eta_l = (1.0 - eta_t) * ( - 0.5 if v[2] == 1 else float(v[1] - 1.0) / float(v[2] - 1.0) - ) - eta_r = (1.0 - eta_t) * (1.0 - eta_l) - x = self.inputs['NodesVector'][batch_id][v[0] - 1] - eta = ( - np.array([eta_l, eta_r, eta_t]) - .reshape((3, 1)) - .astype('float64') - ) - Wconvi = np.tensordot(eta, W, axes=([0], [0])) - x = np.array(x).reshape((1, 1, self.fea_size)) - res = np.tensordot(x, Wconvi, axes=2) - result = result + res - vec.append(result) - vec = np.concatenate(vec, axis=0) - vec = np.concatenate( - [ - vec, - np.zeros( - (self.n - vec.shape[0], W.shape[2], W.shape[3]), - dtype='float64', - ), - ], - axis=0, - ) - return vec - - -if __name__ == "__main__": - unittest.main() diff --git 
a/test/white_list/check_shape_white_list.py b/test/white_list/check_shape_white_list.py index db5a710867277..c01909ba97f9d 100644 --- a/test/white_list/check_shape_white_list.py +++ b/test/white_list/check_shape_white_list.py @@ -23,8 +23,6 @@ 'matmul', 'scatter', 'soft_relu', - 'squared_l2_distance', - 'tree_conv', 'cvm', 'cudnn_lstm', 'rnn', diff --git a/test/white_list/compile_vs_runtime_white_list.py b/test/white_list/compile_vs_runtime_white_list.py index a00c1a720aa5c..0c74eb327a853 100644 --- a/test/white_list/compile_vs_runtime_white_list.py +++ b/test/white_list/compile_vs_runtime_white_list.py @@ -29,7 +29,6 @@ 'gru', 'rpn_target_assign', 'retinanet_target_assign', - 'filter_by_instag', 'im2sequence', 'generate_proposal_labels', 'detection_map', diff --git a/test/white_list/no_grad_set_white_list.py b/test/white_list/no_grad_set_white_list.py index 33960cf4c64d3..81bb85bb784d3 100644 --- a/test/white_list/no_grad_set_white_list.py +++ b/test/white_list/no_grad_set_white_list.py @@ -44,7 +44,6 @@ 'elementwise_pow', 'elementwise_fmin', 'elementwise_fmax', - 'filter_by_instag', 'fused_elemwise_activation', 'fused_emb_seq_pool', 'fused_embedding_seq_pool', diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py index 49b501e765b54..b3855c417ec40 100644 --- a/test/white_list/op_accuracy_white_list.py +++ b/test/white_list/op_accuracy_white_list.py @@ -68,7 +68,6 @@ 'smooth_l1_loss', 'softmax', 'spectral_norm', - 'squared_l2_distance', 'squared_l2_norm', 'tanh', 'mish', diff --git a/tools/enforce/count_enforce_by_file.sh b/tools/enforce/count_enforce_by_file.sh index fafc3516904d8..c79d486c62838 100644 --- a/tools/enforce/count_enforce_by_file.sh +++ b/tools/enforce/count_enforce_by_file.sh @@ -53,7 +53,6 @@ fi FILE_WHITE_LIST="\ box_clip_op.cc \ box_clip_op.h \ - random_crop_op.h \ elementwise_op_function.cu.h \ fused_elemwise_activation_op.cc \ auc_op.cu \ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index c22938e27d150..14f5e4c4bd1ec 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -298,7 +298,6 @@ 'test_ps_dispatcher', 'test_analyzer_rnn2', 'test_multi_gru_seq_fuse_pass', - 'test_filter_by_instag_op', 'test_switch', 'test_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_caching', @@ -915,7 +914,6 @@ 'test_transformer', 'test_for_enumerate', 'test_variable_trans_func', - 'test_squared_l2_distance_op', 'test_quantize_transpiler_v2', 'test_im2sequence_op', 'test_reader_reset', @@ -1101,7 +1099,6 @@ 'test_empty_like_op', 'test_imperative_layer_children', 'nccl_op_test', - 'test_tree_conv_op', 'test_share_data_op', 'test_ir_memory_optimize_transformer', 'test_math_op_patch', @@ -1382,7 +1379,6 @@ 'test_complex_abs', 'test_gradient_accmulator', 'test_instance_norm_op_v2', - 'test_random_crop_op', 'test_mobile_net', 'test_parallel_executor_transformer', 'test_tensor_scalar_type_promotion_dynamic', @@ -1530,7 +1526,6 @@ 'test_imperative_transformer_sorted_gradient', 'test_bicubic_interp_v2_op', 'test_rank_attention_op', - 'test_space_to_depth_op', 'test_image_classification', 'test_custom_relu_op_setup', 'test_sgd_op', @@ -1870,7 +1865,6 @@ 'test_fleet', 'test_flags_use_mkldnn', 'test_flags_mkldnn_ops_on_off', - 'test_filter_by_instag_op', 'test_fetch_var', 'test_fetch_handler', 'test_feed_fetch_method', @@ -2659,7 +2653,6 @@ 'test_unfold_op', 'test_conv_bn_fuse_pass', 'test_truncated_gaussian_random_op', - 'test_tree_conv_op', 'test_traced_layer_err_msg', 'test_unique_with_counts', 
'test_auc_single_pred_op', @@ -2691,7 +2684,6 @@ 'test_optimizer', 'test_deformable_conv_op', 'test_py_reader_push_pop', - 'test_random_crop_op', 'test_shuffle_channel_op', 'test_center_loss', 'test_temporal_shift_op', @@ -2708,13 +2700,10 @@ 'test_top_k_op', 'test_batch_fc_op', 'test_tensor_scalar_type_promotion_static', - 'test_squared_l2_distance_op', 'test_bicubic_interp_op', 'test_spp_op', - 'test_space_to_depth_op', 'test_callbacks', 'test_sigmoid_focal_loss_op', - 'test_collect_fpn_proposals_op', 'test_sequence_unpad_op', 'test_conv1d_transpose_layer', 'test_sequence_pool', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index a7e7ad08e3ab1..228218e46ecf4 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -39,8 +39,6 @@ 'test_sequence_scatter_op', 'test_sequence_slice_op', 'test_slice_op', - 'test_space_to_depth_op', - 'test_squared_l2_distance_op', 'test_accuracy_op', 'test_activation_nn_grad', 'test_adadelta_op', @@ -202,7 +200,6 @@ 'test_fill_constant_op', 'test_fill_op', 'test_fill_zeros_like_op', - 'test_filter_by_instag_op', 'test_flatten2_op', 'test_flatten_contiguous_range_op', 'test_flatten_op', @@ -412,7 +409,6 @@ 'test_queue', 'test_randint_op', 'test_randn_op', - 'test_random_crop_op', 'test_randperm_op', 'test_range', 'test_rank_loss_op', @@ -492,7 +488,6 @@ 'test_trace_op', 'test_trainable', 'test_transpose_op', - 'test_tree_conv_op', 'test_tril_triu_op', 'test_trilinear_interp_op', 'test_trilinear_interp_v2_op', diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index e7c05f2768a83..2b8073c8b5ccf 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -217,7 +217,6 @@ long_time_test="^test_gru_op$|\ ^test_gather_op$|\ ^test_gather_nd_op$|\ ^test_sequence_conv$|\ -^test_space_to_depth_op$|\ ^test_activation_nn_grad$|\ ^test_activation_op$|\ ^test_bicubic_interp_v2_op$|\