diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc
index 0294e1ca54b437..c089e4f0c13f95 100644
--- a/paddle/fluid/framework/feed_fetch_method.cc
+++ b/paddle/fluid/framework/feed_fetch_method.cc
@@ -18,6 +18,8 @@ limitations under the License. */
 
 #include "glog/logging.h"
 
+PHI_DECLARE_bool(enable_new_ir_in_executor);
+
 namespace phi {
 class DenseTensor;
 }  // namespace phi
@@ -34,16 +36,30 @@ void SetFeedVariable(Scope* scope,
   // If var_name Variable is not found in GlobalScope, a new variable will
   // be created.
   VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
-  Variable* g_feed_value = scope->Var(var_name);
-  auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
-  if (index >= feed_inputs.size()) {
-    feed_inputs.resize(index + 1);
+  if (FLAGS_enable_new_ir_in_executor) {
+    // shared data with input tensor
+    auto inner_var_name = var_name + "_" + std::to_string(index);
+    auto feed_ele = scope->Var(inner_var_name);
+    if (!feed_ele->IsType<phi::DenseTensor>()) {
+      VLOG(3) << "Reset " << inner_var_name << " to phi::DenseTensor";
+      feed_ele->Clear();
+    }
+    auto val = feed_ele->GetMutable<phi::DenseTensor>();
+    val->ShareDataWith(input);
+    // set lod
+    val->set_lod(input.lod());
+  } else {
+    Variable* g_feed_value = scope->Var(var_name);
+    auto& feed_inputs = *(g_feed_value->GetMutable<FeedList>());
+    if (index >= feed_inputs.size()) {
+      feed_inputs.resize(index + 1);
+    }
+    // shared data with input tensor
+    auto& val = PADDLE_GET(phi::DenseTensor, feed_inputs[index]);
+    val.ShareDataWith(input);
+    // set lod
+    val.set_lod(input.lod());
   }
-  // shared data with input tensor
-  auto& val = PADDLE_GET(phi::DenseTensor, feed_inputs[index]);
-  val.ShareDataWith(input);
-  // set lod
-  val.set_lod(input.lod());
 }
 
 void SetFeedVariable(Scope* scope,
diff --git a/paddle/fluid/framework/new_executor/standalone_executor.cc b/paddle/fluid/framework/new_executor/standalone_executor.cc
index f792dd40d49944..448d5712ecf3cd 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor.cc
@@ -62,11 +62,11 @@ StandaloneExecutor::StandaloneExecutor(const platform::Place& place,
     execution_config.skip_gc_vars = job->SkipGcVars();
 
     // TODO(phlrain) we only support cpu for now
-    if (FLAGS_enable_new_ir_in_executor && platform::is_cpu_place(place)) {
+    if (FLAGS_enable_new_ir_in_executor) {
       VLOG(6) << "begin to translate" << std::endl;
       auto base_program = paddle::TranslateLegacyProgramToProgram(*program);
       auto kernel_program =
-          paddle::dialect::PdOpLowerToKernelPass(base_program.get());
+          paddle::dialect::PdOpLowerToKernelPass(base_program.get(), place);
       interpretercores_.emplace_back(std::make_shared<InterpreterCore>(
           place_, std::move(kernel_program), scope_, execution_config));
     } else {
diff --git a/paddle/fluid/ir/dialect/pd_op.yaml b/paddle/fluid/ir/dialect/pd_op.yaml
index a7d1cb14e35e5c..903584d57bcc66 100644
--- a/paddle/fluid/ir/dialect/pd_op.yaml
+++ b/paddle/fluid/ir/dialect/pd_op.yaml
@@ -227,3 +227,30 @@
   inplace: null
   view: null
   backward: null
+
+
+- name: shaddow_feed
+  inputs:
+  - typename: Tensor
+    name: x
+    optional: false
+    no_need_buffer: false
+    data_transform: {}
+  attrs: []
+  outputs:
+    - {typename: Tensor, name: out, optional: false, intermediate: false}
+  no_need_buffer: null
+  data_transform: null
+  infer_meta:
+    func: UnchangedInferMeta
+    param: [x]
+  kernel:
+    func: [shaddow_feed]
+    param: [x]
+    backend: null
+    layout: null
+    data_type: null
+    dispatch: {shaddow_feed: null}  # key matches kernel.func above
+    force_backend: null
+  inplace: null
+  backward: null
diff --git a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
index ad3a804eac9116..1a880210afbe10 100644
--- a/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
+++ b/paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.cc
@@ -193,26 +193,13 @@ void HandleForSpecialOp(
 
   if (op_name == "pd.feed") {
     auto value = op->result(0);
-    auto var = CreateVar(value,
-                         inner_scope,
-                         var_name_prefix,
-                         false,
-                         value_2_var_name,
-                         variable_2_var_name,
-                         var_name_2_id,
-                         variable_list);
-    // TODO(phlrain): need to update here, support StringTensor
-    auto out_tensor = var->GetMutable<phi::DenseTensor>();
+    VLOG(6) << "link feed output to feed in variable" << inner_scope;
 
-    auto feed_var =
-        const_cast<paddle::framework::Scope*>(inner_scope->root())->Var("feed");
-    VLOG(6) << "Create var: feed in scope " << inner_scope->root();
     int index =
         op->attributes().at("col").dyn_cast<ir::Int32Attribute>().data();
-    auto feed_list = feed_var->Get<paddle::framework::FeedList>();
-    auto& in_tensor = (PADDLE_GET(phi::DenseTensor, feed_list.at(index)));
-    out_tensor->ShareDataWith(in_tensor);
-    out_tensor->set_lod(in_tensor.lod());
+
+    auto feed_var_name = "feed_" + std::to_string(index);
+    value_2_var_name->emplace(value, feed_var_name);
   }
 
   if (op_name == "builtin.combine") {
diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
index a7f209e7e4c319..d55ce6b24f9cf2 100644
--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.cc
@@ -53,7 +53,7 @@ phi::KernelKey GetKernelKey(
     ir::Operation* op,
     const phi::Place& place,
     const std::unordered_map<ir::Value, ir::OpResult>& map_value_pair,
-    const dialect::OpYamlInfoParser* op_info_parser = nullptr) {
+    std::unique_ptr<dialect::OpYamlInfoParser> op_info_parser = nullptr) {
   if (op->name() == "pd.feed") {
     // NOTE, for now feed op don't need a kernel, so the data type from Op
     // Result the next op use base program datatype
@@ -223,11 +223,11 @@ phi::KernelKey GetKernelKey(
   return res;
 }
 
-std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
+std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
+                                                   phi::Place place) {
   auto program = std::make_unique<ir::Program>(ir::IrContext::Instance());
 
   auto block = prog->block();
-  phi::Place cpu_place(phi::AllocationType::CPU);
 
   ir::IrContext* ctx = ir::IrContext::Instance();
   ctx->GetOrRegisterDialect<paddle::dialect::PaddleDialect>();
@@ -244,14 +244,19 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
     VLOG(6) << "op name " << (*it)->name();
     paddle::dialect::OpYamlInfoInterface op_info_interface =
         (*it)->dyn_cast<paddle::dialect::OpYamlInfoInterface>();
-    OpYamlInfoParser* op_info_parser = nullptr;
+    std::unique_ptr<OpYamlInfoParser> op_info_parser;
     if (op_info_interface) {
-      op_info_parser = new OpYamlInfoParser(op_info_interface.GetOpInfo());
+      op_info_parser.reset(new OpYamlInfoParser(op_info_interface.GetOpInfo()));
     }
+
+    std::string kernel_fn_str;
+    if (op_info_parser != nullptr) {
+      kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
+    }
+
     auto kernel_key =
-        GetKernelKey(*it, cpu_place, map_value_pair, op_info_parser);
+        GetKernelKey(*it, place, map_value_pair, std::move(op_info_parser));
     VLOG(6) << "kernel type " << kernel_key;
-    // create new Op
 
     // only for single output
     // need update new kernel key layout and data tyep
@@ -305,11 +310,6 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
     // constuct input
     std::vector<ir::OpResult> vec_inputs;
 
-    std::string kernel_fn_str;
-    if (op_info_parser != nullptr) {
-      kernel_fn_str = op_info_parser->OpRuntimeInfo().kernel_func[0];
-    }
-
     if ((*it)->num_operands() > 0) {
       for (size_t i = 0; i < (*it)->num_operands(); ++i) {
         auto cur_in = (*it)->operand(i);
@@ -404,6 +404,35 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog) {
     }
 
     program->block()->push_back(op);
+
+    if ((*it)->name() == "pd.feed" && platform::is_gpu_place(place)) {
+      // add shaddow feed op
+      phi::KernelKey shaddow_key{
+          phi::Backend::GPU,
+          phi::DataLayout::ANY,
+          TransToPhiDataType(
+              (*it)->result(0).type().dyn_cast<DenseTensorType>().dtype())};
+      std::unordered_map<std::string, ir::Attribute> attr_map{
+          {"op_name", ir::StrAttribute::get(ctx, "pd.shaddow_feed")},
+          {"kernel_name", ir::StrAttribute::get(ctx, "shaddow_feed")},
+          {"kernel_key", dialect::KernelAttribute::get(ctx, shaddow_key)}};
+
+      auto out_type = paddle::dialect::AllocatedDenseTensorType::get(
+          ctx,
+          phi::TransToPhiPlace(shaddow_key.backend()),
+          (*it)->result(0).type().dyn_cast<dialect::DenseTensorType>());
+
+      ir::Operation* shaddow_op =
+          ir::Operation::Create({op->result(0)}, attr_map, {out_type}, op_info);
+
+      map_op_pair[*it] = shaddow_op;
+      program->block()->push_back(shaddow_op);
+      if ((*it)->num_results() > 0) {
+        for (size_t i = 0; i < shaddow_op->num_results(); ++i) {
+          map_value_pair[(*it)->result(i)] = shaddow_op->result(i);
+        }
+      }
+    }
   }
 
   return program;
diff --git a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h
index 415ce18bb0756a..3e4848720f4cec 100644
--- a/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h
+++ b/paddle/fluid/ir/transforms/pd_op_to_kernel_pass.h
@@ -14,11 +14,13 @@
 #pragma once
 
 #include "paddle/ir/core/program.h"
+#include "paddle/phi/common/place.h"
 
 namespace paddle {
 namespace dialect {
 
-std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog);
+std::unique_ptr<ir::Program> PdOpLowerToKernelPass(
+    ir::Program* prog, phi::Place place = phi::CPUPlace());
 
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
index 5d7f5d747eb3fa..342ad6a334cc30 100644
--- a/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
+++ b/paddle/phi/kernels/cpu/feed_with_place_kernel.cc
@@ -16,6 +16,7 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/feed_with_place_impl.h"
 
 namespace phi {
 
@@ -26,11 +27,20 @@ void FeedWithPlaceKernel(const Context& ctx,
                          DenseTensor* out) {}
 
 }  // namespace phi
-PD_REGISTER_KERNEL(feed_with_place,
+
+PD_REGISTER_KERNEL(
+    feed_with_place, CPU, ALL_LAYOUT, phi::FeedWithPlaceKernel, float) {}
+
+PD_REGISTER_KERNEL(shaddow_feed,
                    CPU,
                    ALL_LAYOUT,
-                   phi::FeedWithPlaceKernel,
+                   phi::ShaddowFeedKernel,
+                   bool,
                    float,
                    int32_t,
                    int64_t,
-                   double) {}
+                   double,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::complex64,
+                   phi::complex128) {}
diff --git a/paddle/phi/kernels/cpu/fetch_kernel.cc b/paddle/phi/kernels/cpu/fetch_kernel.cc
index 672ceba1b84b35..cdd42c9ef83243 100644
--- a/paddle/phi/kernels/cpu/fetch_kernel.cc
+++ b/paddle/phi/kernels/cpu/fetch_kernel.cc
@@ -16,17 +16,8 @@
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/fetch_impl.h"
 
-namespace phi {
-
-template <typename T, typename Context>
-void FetchKernel(const Context& dev_ctx,
-                 const DenseTensor& x,
-                 DenseTensor* out) {
-  phi::Copy(dev_ctx, x, phi::CPUPlace(), true, out);
-  out->set_lod(x.lod());
-}
-}  // namespace phi
 PD_REGISTER_KERNEL(fetch,
                    CPU,
                    ALL_LAYOUT,
diff --git a/paddle/phi/kernels/feed_with_place_kernel.h b/paddle/phi/kernels/feed_with_place_kernel.h
index 624992da5432c9..4e8e9063c8d9b9 100644
--- a/paddle/phi/kernels/feed_with_place_kernel.h
+++ b/paddle/phi/kernels/feed_with_place_kernel.h
@@ -24,4 +24,9 @@ void FeedWithPlaceKernel(const Context& ctx,
                          phi::DataType data_type,
                          DenseTensor* out);
 
+template <typename T, typename Context>
+void ShaddowFeedKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out);
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/gpu/feed_with_place_kernel.cu b/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
new file mode 100644
index 00000000000000..07d4c8719da2c7
--- /dev/null
+++ b/paddle/phi/kernels/gpu/feed_with_place_kernel.cu
@@ -0,0 +1,33 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/feed_with_place_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/feed_with_place_impl.h"
+
+PD_REGISTER_KERNEL(shaddow_feed,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::ShaddowFeedKernel,
+                   bool,
+                   float,
+                   int32_t,
+                   int64_t,
+                   double,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::complex64,
+                   phi::complex128) {}
diff --git a/paddle/phi/kernels/gpu/fetch_kernel.cu b/paddle/phi/kernels/gpu/fetch_kernel.cu
new file mode 100644
index 00000000000000..b132ae975b815a
--- /dev/null
+++ b/paddle/phi/kernels/gpu/fetch_kernel.cu
@@ -0,0 +1,37 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/fetch_kernel.h"
+
+#include "paddle/phi/kernels/impl/fetch_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(fetch,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FetchKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   uint8_t,
+                   int8_t,
+                   int16_t,
+                   phi::float16,
+                   phi::bfloat16,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   bool) {}
diff --git a/paddle/phi/kernels/impl/feed_with_place_impl.h b/paddle/phi/kernels/impl/feed_with_place_impl.h
new file mode 100644
index 00000000000000..a7602c2d37927c
--- /dev/null
+++ b/paddle/phi/kernels/impl/feed_with_place_impl.h
@@ -0,0 +1,43 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/tensor_utils.h"
+
+namespace phi {
+
+// Makes `out` hold the contents of the fed tensor `x` on the place of `ctx`.
+//
+// `out` is allocated through `ctx` first. If `x` is on the same place as the
+// allocated `out`, `out` shares the storage of `x`; otherwise `x` is copied
+// to `ctx.GetPlace()` via phi::Copy. The LoD of `x` is set on `out` on both
+// paths.
+template <typename T, typename Context>
+void ShaddowFeedKernel(const Context& ctx,
+                       const DenseTensor& x,
+                       DenseTensor* out) {
+  // Allocate before the comparison below, which reads out->place().
+  ctx.template Alloc<T>(out);
+  if (x.place() == out->place()) {
+    out->ShareDataWith(x);
+  } else {
+    phi::Copy<Context>(ctx, x, ctx.GetPlace(), true, out);
+  }
+  // Set the LoD on both paths; phi::Copy is not assumed to carry it.
+  out->set_lod(x.lod());
+}
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/fetch_impl.h b/paddle/phi/kernels/impl/fetch_impl.h
new file mode 100644
index 00000000000000..d90a813e4a16b3
--- /dev/null
+++ b/paddle/phi/kernels/impl/fetch_impl.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/tensor_utils.h"
+
+namespace phi {
+
+// Copies `x` to phi::CPUPlace() into `out` through phi::Copy and then gives
+// `out` the LoD of `x`.
+template <typename T, typename Context>
+void FetchKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
+  phi::Copy(ctx, x, phi::CPUPlace(), true, out);
+  // Set the LoD explicitly; it is not assumed to be carried by phi::Copy.
+  out->set_lod(x.lod());
+}
+
+}  // namespace phi
diff --git a/test/ir/new_ir/test_standalone_new_ir.py b/test/ir/new_ir/test_standalone_new_ir.py
index 9be4e07fddc77d..c67370b2e0a2fc 100644
--- a/test/ir/new_ir/test_standalone_new_ir.py
+++ b/test/ir/new_ir/test_standalone_new_ir.py
@@ -24,7 +24,11 @@
 
 class TestNewIr(unittest.TestCase):
     def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
         exe = paddle.static.Executor(place)
 
         main_program = paddle.static.Program()
@@ -44,7 +48,11 @@ def test_with_new_ir(self):
 
 class TestCombineOp(unittest.TestCase):
     def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
         exe = paddle.static.Executor(place)
 
         main_program = paddle.static.Program()
@@ -64,7 +72,11 @@ def test_with_new_ir(self):
 
 class TestFeedOp(unittest.TestCase):
     def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
         exe = paddle.static.Executor(place)
 
         main_program = paddle.static.Program()
@@ -91,6 +103,8 @@ def test_with_new_ir(self):
 
 class TestSelectedRows(unittest.TestCase):
     def test_with_new_ir(self):
+        # TODO(phlrain): support selected rows in GPU
+        # place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace()
         place = paddle.CPUPlace()
         exe = paddle.static.Executor(place)
 
@@ -113,7 +127,11 @@ def test_with_new_ir(self):
 
 class TestAddGradOp(unittest.TestCase):
     def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
         exe = paddle.static.Executor(place)
 
         main_program = paddle.static.Program()
@@ -143,7 +161,11 @@ def test_with_new_ir(self):
 
 class TestSplitOp(unittest.TestCase):
     def test_with_new_ir(self):
-        place = paddle.CPUPlace()
+        place = (
+            paddle.CUDAPlace(0)
+            if paddle.is_compiled_with_cuda()
+            else paddle.CPUPlace()
+        )
         exe = paddle.static.Executor(place)
 
         main_program = paddle.static.Program()