diff --git a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
index da2c063d9c00d..d8d87328bd3f8 100644
--- a/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+++ b/paddle/cinn/auto_schedule/analysis/analyze_ir.cc
@@ -41,7 +41,7 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
   for (const ir::Expr& e : indices) {
     // Whether we have to convert other types, like const numbers to Var?
     if (e.As<ir::_Var_>() != nullptr) {
-      ir::Expr copy_e = optim::IRCopy(e);
+      ir::Expr copy_e = ir::ir_utils::IRCopy(e);
       ir::_Var_* var_ref = copy_e.As<ir::_Var_>();
       result.emplace_back(ir::Var(var_ref));
     }
diff --git a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
index db2d3f62ed6a9..1228d5abaf072 100644
--- a/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
+++ b/paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
@@ -218,7 +218,7 @@ void FeatureExtractor::Visit(const For *x) {
 }
 
 void FeatureExtractor::Visit(const PolyFor *x) {
-  Expr copy = optim::IRCopy(Expr(x));
+  Expr copy = ir::ir_utils::IRCopy(Expr(x));
   feature_.IntoLoopBlock();
   optim::TransformPolyForToFor(&copy);
   ir::For *loop = copy.As<For>();
diff --git a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
index 9a7bf9d568bc3..6d4e8a70cc17b 100644
--- a/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
+++ b/paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
@@ -56,7 +56,7 @@ ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs,
                               const std::string& task_key) {
   std::vector<Expr> exprs;
   for (auto&& func : lowered_funcs) {
-    exprs.emplace_back(optim::IRCopy(func->body));
+    exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
   }
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   task_registry->Regist(task_key, ir::ModuleExpr(exprs));
diff --git a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
index 1881697237e82..5bb351767e8cb 100644
--- a/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
+++ b/paddle/cinn/auto_schedule/search_strategy/evolutionary_search.cc
@@ -134,7 +134,7 @@ std::vector<SearchState> EvolutionarySearch::GetTopKCandidatesFromDatabase(
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto&& record : records) {
     ir::IRSchedule ir_sch(
-        optim::IRCopy(task_registry->Get(task_key)->module_expr),
+        ir::ir_utils::IRCopy(task_registry->Get(task_key)->module_expr),
         utils::ForkRandomState(&rand_seed_));
     ir::ScheduleDesc::ReplayWithProto(record.trace, &ir_sch);
     results.emplace_back(SearchState(std::move(ir_sch), record.predicted_cost));
@@ -181,9 +181,9 @@ SearchState EvolutionarySearch::CrossOver(const SearchState& state1,
 
   for (size_t i = 0; i < father_exprs.size(); ++i) {
     if (utils::SampleUniformInt(0, 2, &rand_seed_) == 0) {
-      cross_over_exprs.push_back(optim::IRCopy(father_exprs[i]));
+      cross_over_exprs.push_back(ir::ir_utils::IRCopy(father_exprs[i]));
     } else {
-      cross_over_exprs.push_back(optim::IRCopy(mother_exprs[i]));
+      cross_over_exprs.push_back(ir::ir_utils::IRCopy(mother_exprs[i]));
     }
   }
   auto res = SearchState(ir::IRSchedule(ir::ModuleExpr(cross_over_exprs),
@@ -217,7 +217,7 @@ SearchState EvolutionarySearch::Mutate(
   const auto& task_key = tune_task_.serialized_key;
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   ir::IRSchedule new_ir_sch(
-      optim::IRCopy(task_registry->Get(task_key)->module_expr),
+      ir::ir_utils::IRCopy(task_registry->Get(task_key)->module_expr),
       utils::ForkRandomState(rand_seed));
   new_trace.Replay(&new_ir_sch, true);
   ApplyPostScheduleRules(&new_ir_sch, post_schedule_rules_);
diff --git a/paddle/cinn/auto_schedule/task/task_optimizer.cc b/paddle/cinn/auto_schedule/task/task_optimizer.cc
index ea1b18c764533..d76797d9953ec 100644
--- a/paddle/cinn/auto_schedule/task/task_optimizer.cc
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc
@@ -247,7 +247,7 @@ TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
   auto& optimized_funcs = result.functions;
   auto& best_cost = result.cost;
   // use initial lowered function as default result
-  optimized_funcs = optim::IRCopy(task_->lowered_funcs);
+  optimized_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
   if (options.num_measure_trials ==
       0) {  // no need to measure and simply return the best searched
     std::vector<MeasureInput> measure_candidates;
@@ -347,7 +347,7 @@ std::vector<SearchState> TaskOptimizer::SearchOneRound(
     CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
         << "RuntimeError: Expr size is not equal to LoweredFunc size in "
            "TaskOptimizer";
-    auto init_funcs = optim::IRCopy(task_->lowered_funcs);
+    auto init_funcs = ir::ir_utils::IRCopy(task_->lowered_funcs);
     std::vector<ir::LoweredFunc> valid_funcs;
     for (size_t j = 0; j < best_exprs.size(); ++j) {
       auto updated_f =
diff --git a/paddle/cinn/auto_schedule/task/task_registry.h b/paddle/cinn/auto_schedule/task/task_registry.h
index 7cff52c220461..22eb49fa2c0a1 100644
--- a/paddle/cinn/auto_schedule/task/task_registry.h
+++ b/paddle/cinn/auto_schedule/task/task_registry.h
@@ -63,7 +63,7 @@ class InitialTaskRegistry : public Registry<InitialTaskInfo> {
     std::lock_guard<std::mutex> guard(registering_mutex);
     if (fmap_.count(task_key) == 0) {
       InitialTaskInfo* task_info =
-          new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
+          new InitialTaskInfo(task_key, ir::ir_utils::IRCopy(module_expr));
       __REGISTER__(task_key, task_info);
     }
   }
diff --git a/paddle/cinn/backends/codegen_cuda_util.h b/paddle/cinn/backends/codegen_cuda_util.h
index 1b406ef2457e1..6ae64cbb36172 100644
--- a/paddle/cinn/backends/codegen_cuda_util.h
+++ b/paddle/cinn/backends/codegen_cuda_util.h
@@ -127,7 +127,7 @@ struct CollectHostFunctionVisitor : public ir::IRMutator<> {
   }
 
   Expr CreateDeviceFunctionGivenDeviceKernel(Expr expr) {
-    auto copied = optim::IRCopy(expr);
+    auto copied = ir::ir_utils::IRCopy(expr);
     auto* lowered_func = copied.as_lowered_func();
     lowered_func->name = GenDeviceKernelName(lowered_func->name);
     return copied;
diff --git a/paddle/cinn/common/cas.cc b/paddle/cinn/common/cas.cc
index bf1c9092ed5eb..727c3b98e4ced 100644
--- a/paddle/cinn/common/cas.cc
+++ b/paddle/cinn/common/cas.cc
@@ -1584,7 +1584,7 @@ bool CASasSymbol(Expr expr) {
 
 Expr ConvertCinnToCAS(Expr expr) {
   VLOG(7) << "Begin ConvertCinnToCAS " << expr;
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1710,7 +1710,7 @@ Expr ConvertCinnToCAS(Expr expr) {
  * simplify the condition ensures correctness, though not sufficient.
  */
 Expr ReplaceMinToConstant(Expr expr) {
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1727,10 +1727,10 @@ Expr ReplaceMinToConstant(Expr expr) {
       auto min_b = op->b();
       if (min_a.is_constant() && !min_b.is_constant()) {
         CHECK(min_a->type().is_integer());
-        *expr = optim::IRCopy(min_a);
+        *expr = ir::ir_utils::IRCopy(min_a);
       } else if (min_b.is_constant() && !min_a.is_constant()) {
         CHECK(min_b->type().is_integer());
-        *expr = optim::IRCopy(min_b);
+        *expr = ir::ir_utils::IRCopy(min_b);
       }
     }
   };
@@ -1743,7 +1743,7 @@ Expr ReplaceMinToConstant(Expr expr) {
  * constant value and 1 inconstant value, return the constant max value.
  */
 Expr ReplaceMaxToConstant(Expr expr) {
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
   struct Mutator : public ir::IRMutator<ir::Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
     void Visit(Expr* expr) { ir::IRMutator<>::Visit(expr, expr); }
@@ -1760,10 +1760,10 @@ Expr ReplaceMaxToConstant(Expr expr) {
       auto max_b = op->b();
       if (max_a.is_constant() && !max_b.is_constant()) {
         CHECK(max_a->type().is_integer());
-        *expr = optim::IRCopy(max_a);
+        *expr = ir::ir_utils::IRCopy(max_a);
       } else if (max_b.is_constant() && !max_a.is_constant()) {
         CHECK(max_b->type().is_integer());
-        *expr = optim::IRCopy(max_b);
+        *expr = ir::ir_utils::IRCopy(max_b);
       }
     }
   };
@@ -1773,7 +1773,7 @@ Expr ReplaceMaxToConstant(Expr expr) {
 
 Expr ConvertCasToCinn(Expr expr) {
   VLOG(7) << "Begin ConvertCasToCinn : " << expr;
-  Expr copied = optim::IRCopy(expr);
+  Expr copied = ir::ir_utils::IRCopy(expr);
 
   struct Mutator : ir::IRMutator<Expr*> {
     void operator()(Expr* expr) { Visit(expr); }
diff --git a/paddle/cinn/ir/schedule/ir_schedule.cc b/paddle/cinn/ir/schedule/ir_schedule.cc
index fab8a53deb121..b4067d1fbd05a 100644
--- a/paddle/cinn/ir/schedule/ir_schedule.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule.cc
@@ -189,7 +189,7 @@ std::vector<Expr> ScheduleImpl::Split(const Expr& loop,
     new_loop_vars.push_back(temp_var);
   }
   substitute_value = common::AutoSimplify(substitute_value);
-  Expr new_node = optim::IRCopy(for_node->body);
+  Expr new_node = ir::ir_utils::IRCopy(for_node->body);
   ReplaceExpr(&new_node, {for_node->loop_var}, {substitute_value});
   std::vector<Expr> splited_loops;
   splited_loops.resize(processed_factors.size());
@@ -252,7 +252,7 @@ Expr ScheduleImpl::Fuse(const std::vector<Expr>& loops) {
   }
   substitute_value[0] = fused_expr;
 
-  Expr fused_body = optim::IRCopy(for_nodes.back()->body);
+  Expr fused_body = ir::ir_utils::IRCopy(for_nodes.back()->body);
   ReplaceExpr(&fused_body, loop_vars, substitute_value);
   optim::Simplify(&fused_body);
   Expr fused_extent(1);
@@ -321,7 +321,7 @@ void ScheduleImpl::MutateForType(const Expr& loop,
       << "loop is not serial, current forloop type is "
       << static_cast<int>(for_node->for_type()) << ", and it cannot become "
       << static_cast<int>(for_type);
-  auto loop_copy = optim::IRCopy(loop);
+  auto loop_copy = ir::ir_utils::IRCopy(loop);
   auto* new_for_node = loop_copy.As<ir::For>();
   CHECK(new_for_node);
   new_for_node->set_for_type(for_type);
@@ -674,7 +674,7 @@ struct RfCreater : public ir::IRMutator<> {
     CHECK(root_realize);
     auto root_block = root_realize->schedule_block.As<ScheduleBlock>();
     CHECK(root_block);
-    Expr root_loop = optim::IRCopy(root_block->body);
+    Expr root_loop = ir::ir_utils::IRCopy(root_block->body);
     if (auto block = root_loop.As<Block>()) {
       CHECK_EQ(block->stmts.size(), 1U)
           << "rfactor root should only have one block stmt";
@@ -685,13 +685,13 @@ struct RfCreater : public ir::IRMutator<> {
     auto rf_for = rf_loop_.As<For>();
     CHECK(rf_for);
     // create new rfactor forloops
-    Expr new_rf_forloop = optim::IRCopy(root_loop);
+    Expr new_rf_forloop = ir::ir_utils::IRCopy(root_loop);
     RfMutator rf_mutator(rf_loop_, rf_axis_);
     rf_mutator(&new_rf_forloop);
     VLOG(3) << "After RfMutator, new rf_forloop is\n" << new_rf_forloop;
     auto new_rf_tensor = rf_mutator.GetNewRfTensor();
     // create final write-back forloops
-    Expr final_forloop = optim::IRCopy(root_loop);
+    Expr final_forloop = ir::ir_utils::IRCopy(root_loop);
     FinalMutator final_mutator(rf_loop_, rf_axis_, new_rf_tensor);
     final_mutator(&final_forloop);
     VLOG(3) << "After FinalMuator, final write-back forloop is\n"
@@ -721,7 +721,7 @@ struct CacheReadRewriter : public ir::IRMutator<> {
  public:
   static Expr Rewrite(const Expr& root, CacheBlockInfo* info) {
     CacheReadRewriter rewriter(root, info);
-    Expr new_root = optim::IRCopy(root);
+    Expr new_root = ir::ir_utils::IRCopy(root);
     rewriter(&new_root);
     return new_root;
   }
@@ -762,7 +762,7 @@ struct CacheWriteRewriter : public ir::IRMutator<> {
  public:
   static Expr Rewrite(const Expr& root, CacheBlockInfo* info) {
     CacheWriteRewriter rewriter(root, info);
-    Expr new_root = optim::IRCopy(root);
+    Expr new_root = ir::ir_utils::IRCopy(root);
     rewriter.mutate_cache_block = true;
     rewriter(&info->cache_block);
     rewriter.mutate_cache_block = false;
@@ -1194,7 +1194,7 @@ struct LoopReconstructor : public ir::IRMutator<> {
                             loop_.As<ir::For>()->device_api,
                             std::move(loop_body));
     }
-    new_loop_ = optim::IRCopy(loop_);
+    new_loop_ = ir::ir_utils::IRCopy(loop_);
 
     // Replace the copied Tensor object with the original Tensor object,
     // to ensure that the same Tensor in a AST is the same object.
@@ -1431,9 +1431,9 @@ void ScheduleImpl::SimpleComputeAt(const Expr& block, const Expr& loop) {
   }
 
   Expr result = loops.size() < block_loops.size()
-                    ? optim::IRCopy(block_loops[loops.size()])
-                    : optim::IRCopy(this_block);
-  Expr new_loop = optim::IRCopy(this_loop);
+                    ? ir::ir_utils::IRCopy(block_loops[loops.size()])
+                    : ir::ir_utils::IRCopy(this_block);
+  Expr new_loop = ir::ir_utils::IRCopy(this_loop);
 
   // Get the body of block_loop under the same loops
   auto body = block_loops.at(loops.size() - 1).As<ir::For>()->body;
@@ -1608,7 +1608,7 @@ void ComputeInliner::Visit(const ir::Load* expr, Expr* op) {
 Expr ComputeInliner::ReplaceInlinedTensor(Expr* load) {
   CHECK(load->As<ir::Load>());
   SetIndexSubstitution(load->As<ir::Load>()->indices);
-  Expr value_copy = optim::IRCopy(inlined_store_.As<Store>()->value);
+  Expr value_copy = ir::ir_utils::IRCopy(inlined_store_.As<Store>()->value);
   ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_);
   return value_copy;
 }
@@ -1684,7 +1684,7 @@ void ReverseComputeInliner::Visit(const ir::Store* expr, Expr* op) {
 Expr ReverseComputeInliner::ReplaceInlinedTensor(Expr* load) {
   CHECK(load->As<ir::Load>());
   SetIndexSubstitution(load->As<ir::Load>()->indices);
-  Expr value_copy = optim::IRCopy(inlined_store_.As<Store>()->value);
+  Expr value_copy = ir::ir_utils::IRCopy(inlined_store_.As<Store>()->value);
   return value_copy;
 }
 
@@ -1699,7 +1699,7 @@ Expr ReverseComputeInliner::ReplaceTargetTensor(Expr* store) {
     idx_sub_expr_.emplace_back(idx_vars_[i]);
   }
 
-  Expr value_copy = optim::IRCopy(target_store_);
+  Expr value_copy = ir::ir_utils::IRCopy(target_store_);
   ReplaceExpr(&value_copy, idx_sub_var_, idx_sub_expr_);
   return value_copy;
 }
@@ -1936,7 +1936,7 @@ void ScheduleImpl::Annotate(const Expr& block,
   CHECK(block.As<ir::ScheduleBlockRealize>());
   CHECK(block.As<ir::ScheduleBlockRealize>()
             ->schedule_block.As<ir::ScheduleBlock>());
-  auto copied_block = optim::IRCopy(block);
+  auto copied_block = ir::ir_utils::IRCopy(block);
   auto* schedule_block = copied_block.As<ir::ScheduleBlockRealize>()
                              ->schedule_block.As<ir::ScheduleBlock>();
   schedule_block->attrs.emplace(key, value);
@@ -2195,7 +2195,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   }
   CHECK(!used_target_loop_vars.empty());
   std::vector<Expr> used_target_loops;
-  auto expr_copy = optim::IRCopy(expr);
+  auto expr_copy = ir::ir_utils::IRCopy(expr);
   for (auto& var : used_target_loop_vars) {
     auto find_loop_var = ir::ir_utils::CollectIRNodesWithoutTensor(
         expr_copy,
@@ -2220,7 +2220,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
   VLOG(3) << "changed_loop_num is : " << changed_loop_num;
   VLOG(3) << "old_iter_values.size() is : " << old_iter_values.size();
   if (changed_loop_num >= static_cast<int>(old_iter_values.size())) {
-    new_loop = optim::IRCopy(block);
+    new_loop = ir::ir_utils::IRCopy(block);
     new_loop.As<ir::ScheduleBlockRealize>()->iter_values = new_iter_values;
   } else {
     CHECK(old_iter_values[changed_loop_num].as_var());
@@ -2234,7 +2234,7 @@ void ScheduleImpl::CopyTransformAndLoopInfo(const Expr& block,
         },
         true);
     CHECK_EQ(find_partial_loop.size(), 1U);
-    new_loop = optim::IRCopy(*find_partial_loop.begin());
+    new_loop = ir::ir_utils::IRCopy(*find_partial_loop.begin());
     auto find_schedule_block = ir::ir_utils::CollectIRNodesWithoutTensor(
         new_loop,
         [&](const Expr* x) { return x->As<ir::ScheduleBlockRealize>(); },
@@ -2332,13 +2332,14 @@ IRSchedule::IRSchedule(ir::ModuleExpr&& mod_expr,
 }
 
 IRSchedule::IRSchedule(const IRSchedule& other)
-    : impl_(std::make_unique<ScheduleImpl>(optim::IRCopy(other.GetModule()))),
+    : impl_(std::make_unique<ScheduleImpl>(
+          ir::ir_utils::IRCopy(other.GetModule()))),
       trace_(other.trace_) {
   this->InitSeed(other.ForkSeed());
 }
 
 IRSchedule& IRSchedule::operator=(const IRSchedule& src) {
-  impl_ = std::make_unique<ScheduleImpl>(optim::IRCopy(src.GetModule()));
+  impl_ = std::make_unique<ScheduleImpl>(ir::ir_utils::IRCopy(src.GetModule()));
   trace_ = src.trace_;
   this->InitSeed(src.ForkSeed());
   return *this;
diff --git a/paddle/cinn/ir/schedule/ir_schedule_util.cc b/paddle/cinn/ir/schedule/ir_schedule_util.cc
index 45779788e9c54..a4c9ef62b25f2 100644
--- a/paddle/cinn/ir/schedule/ir_schedule_util.cc
+++ b/paddle/cinn/ir/schedule/ir_schedule_util.cc
@@ -348,8 +348,8 @@ IterRange GetAccessedRange(const Expr& index,
     var_maxs.emplace_back(range.min + range.extent - 1);
   }
 
-  Expr indice_min = optim::IRCopy(index);
-  Expr indice_max = optim::IRCopy(index);
+  Expr indice_min = ir::ir_utils::IRCopy(index);
+  Expr indice_max = ir::ir_utils::IRCopy(index);
   // replace the var by the corresponding iter_value
   ReplaceExpr(&indice_min, iter_vars, var_mins);
   ReplaceExpr(&indice_max, iter_vars, var_maxs);
@@ -408,7 +408,7 @@ std::vector<IterRange> CalculateTensorRegions(
 
   std::vector<IterRange> result;
   for (int i = 0; i < tensor_indices.size(); ++i) {
-    Expr binded_index = optim::IRCopy(tensor_indices[i]);
+    Expr binded_index = ir::ir_utils::IRCopy(tensor_indices[i]);
     ReplaceExpr(&binded_index, iter_vars, iter_values);
     auto range = GetAccessedRange(binded_index, loop_vars, loop_ranges);
 
@@ -656,7 +656,7 @@ Expr ConstructOtherStmtChain(const std::vector<Expr>& stmts,
                              const std::vector<int> reordered_indices) {
   Expr new_loop;
   for (int i = reordered_indices.size() - 1; i >= 0; --i) {
-    Expr temp = optim::IRCopy(loops[reordered_indices[i]]);
+    Expr temp = ir::ir_utils::IRCopy(loops[reordered_indices[i]]);
     CHECK(temp.defined());
     CHECK(temp.As<ir::For>());
     if (new_loop.defined()) {
@@ -695,10 +695,10 @@ Expr ConstructNewLoopChain(const std::vector<Expr>& chain,
     Expr temp;
     if (loop_set.count(loop_in_chain)) {
       CHECK_GE(index, 0);
-      temp = optim::IRCopy(ordered_loops[index]);
+      temp = ir::ir_utils::IRCopy(ordered_loops[index]);
       --index;
     } else {
-      temp = optim::IRCopy(loop_in_chain);
+      temp = ir::ir_utils::IRCopy(loop_in_chain);
     }
     CHECK(temp.defined());
     CHECK(temp.As<ir::For>());
@@ -1029,9 +1029,9 @@ std::vector<IterRange> CalculateRequiredRegions(
     for (const Expr& req_block : required_blocks) {
       CHECK(req_block.As<ir::ScheduleBlockRealize>());
       Expr block_body =
-          optim::IRCopy(req_block.As<ir::ScheduleBlockRealize>()
-                            ->schedule_block.As<ir::ScheduleBlock>()
-                            ->body);
+          ir::ir_utils::IRCopy(req_block.As<ir::ScheduleBlockRealize>()
+                                   ->schedule_block.As<ir::ScheduleBlock>()
+                                   ->body);
       auto iter_vars = req_block.As<ir::ScheduleBlockRealize>()
                            ->schedule_block.As<ir::ScheduleBlock>()
                            ->iter_vars;
diff --git a/paddle/cinn/ir/test/ir_copy_test.cc b/paddle/cinn/ir/test/ir_copy_test.cc
index cd3199d4947dd..570e1b12aa213 100644
--- a/paddle/cinn/ir/test/ir_copy_test.cc
+++ b/paddle/cinn/ir/test/ir_copy_test.cc
@@ -19,13 +19,14 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 
 namespace cinn {
-namespace optim {
+namespace ir {
+namespace ir_utils {
 
 TEST(IrCopy, basic) {
   Expr a(1.f);
   auto aa = IRCopy(a);
   LOG(INFO) << "aa " << aa;
 }
-
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/test/schedule_desc_test.cc b/paddle/cinn/ir/test/schedule_desc_test.cc
index 66a01b14b08d7..a798101813868 100644
--- a/paddle/cinn/ir/test/schedule_desc_test.cc
+++ b/paddle/cinn/ir/test/schedule_desc_test.cc
@@ -95,7 +95,7 @@ std::vector<ir::LoweredFunc> LowerCompute(
 IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs) {
   std::vector<Expr> exprs;
   for (auto&& func : lowered_funcs) {
-    exprs.emplace_back(optim::IRCopy(func->body));
+    exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
   }
   return ir::IRSchedule(ir::ModuleExpr(exprs));
 }
@@ -106,10 +106,11 @@ std::string SourceCodeGen(const ModuleExpr& module_expr,
                           const Target& target) {
   auto exprs = module_expr.GetExprs();
   CHECK_EQ(exprs.size(), lowered_funcs.size()) << "size of func is not euqal";
-  std::vector<ir::LoweredFunc> updated_funcs = optim::IRCopy(lowered_funcs);
+  std::vector<ir::LoweredFunc> updated_funcs =
+      ir::ir_utils::IRCopy(lowered_funcs);
   Module::Builder builder("test_module", target);
   for (auto i = 0; i < lowered_funcs.size(); ++i) {
-    updated_funcs[i]->body = optim::IRCopy(exprs.at(i));
+    updated_funcs[i]->body = ir::ir_utils::IRCopy(exprs.at(i));
     builder.AddFunction(updated_funcs[i]);
   }
   auto module = builder.Build();
@@ -839,12 +840,14 @@ TEST_F(TestScheduleDesc, StepKind_MergeExprs) {
   auto funcs_1 =
       LowerCompute({32, 32, 32}, target, true, "elementwise-add_const");
 
-  ir::IRSchedule ir_sch = ir::IRSchedule(ir::ModuleExpr(
-      {optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)}));
+  ir::IRSchedule ir_sch =
+      ir::IRSchedule(ir::ModuleExpr({ir::ir_utils::IRCopy(funcs_0[0]->body),
+                                     ir::ir_utils::IRCopy(funcs_0[0]->body)}));
   ir_sch.MergeExprs();
   trace.Append(ScheduleDesc::Step("MergeExprs", {}, {}, {}));
-  ir::IRSchedule replay_sch = ir::IRSchedule(ir::ModuleExpr(
-      {optim::IRCopy(funcs_0[0]->body), optim::IRCopy(funcs_0[0]->body)}));
+  ir::IRSchedule replay_sch =
+      ir::IRSchedule(ir::ModuleExpr({ir::ir_utils::IRCopy(funcs_0[0]->body),
+                                     ir::ir_utils::IRCopy(funcs_0[0]->body)}));
   trace.Replay(&replay_sch);
 
   auto lhs_exprs = ir_sch.GetModule().GetExprs();
diff --git a/paddle/cinn/ir/utils/ir_copy.cc b/paddle/cinn/ir/utils/ir_copy.cc
index 22d7c99bcd322..b157f6030a5e6 100644
--- a/paddle/cinn/ir/utils/ir_copy.cc
+++ b/paddle/cinn/ir/utils/ir_copy.cc
@@ -27,9 +27,9 @@
 #include "paddle/cinn/ir/utils/ir_printer.h"
 
 namespace cinn {
-namespace optim {
-using namespace ir;  // NOLINT
-
+namespace ir {
+namespace ir_utils {
+namespace {
 struct IRCopyVisitor : public ir::IRVisitorRequireReImpl<Expr> {
   // Use maps to unify all the copied tensors and buffers.
   std::map<std::string, ir::_Tensor_*> tensor_map;
@@ -474,7 +474,7 @@ Expr IRCopyVisitor::Visit(const ir::intrinsics::BuiltinIntrin* op) {
   return intrinsics::BuiltinIntrin::Make(
       op->name, op->args, op->id, op->arg_nums, op->type());
 }
-
+}  // namespace
 Expr IRCopy(Expr x) {
   IRCopyVisitor visitor;
   auto copied = visitor.Visit(&x);
@@ -507,6 +507,6 @@ std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x) {
   }
   return res;
 }
-
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_copy.h b/paddle/cinn/ir/utils/ir_copy.h
index 726739394eba6..594f07e91cfa0 100644
--- a/paddle/cinn/ir/utils/ir_copy.h
+++ b/paddle/cinn/ir/utils/ir_copy.h
@@ -24,9 +24,8 @@ namespace cinn {
 
 namespace ir {
 class ModuleExpr;
-}  // namespace ir
 
-namespace optim {
+namespace ir_utils {
 
 //! Shallow copy an expression.
 Expr IRCopy(Expr x);
@@ -39,5 +38,6 @@ ir::LoweredFunc IRCopy(const ir::LoweredFunc& x);
 
 std::vector<ir::LoweredFunc> IRCopy(const std::vector<ir::LoweredFunc>& x);
 
-}  // namespace optim
+}  // namespace ir_utils
+}  // namespace ir
 }  // namespace cinn
diff --git a/paddle/cinn/ir/utils/ir_replace.cc b/paddle/cinn/ir/utils/ir_replace.cc
index da2305359c5e9..a1b3138291023 100644
--- a/paddle/cinn/ir/utils/ir_replace.cc
+++ b/paddle/cinn/ir/utils/ir_replace.cc
@@ -43,14 +43,14 @@ struct IrReplaceMutator : ir::IRMutator<Expr*> {
   void Visit(const ir::_Var_* op, Expr* expr) override {
     if (op->node_type() == from_->node_type() &&
         from_repr_ == GetStreamCnt(*expr)) {
-      *expr = optim::IRCopy(to_);
+      *expr = ir::ir_utils::IRCopy(to_);
     }
   }
 
   void Visit(const ir::Broadcast* op, Expr* expr) override {
     if (op->node_type() == from_->node_type() &&
         from_repr_ == GetStreamCnt(*expr)) {
-      *expr = optim::IRCopy(to_);
+      *expr = ir::ir_utils::IRCopy(to_);
     }
   }
 
diff --git a/paddle/cinn/optim/compute_inline_expand.cc b/paddle/cinn/optim/compute_inline_expand.cc
index d4123729bc53f..f9563449fb128 100644
--- a/paddle/cinn/optim/compute_inline_expand.cc
+++ b/paddle/cinn/optim/compute_inline_expand.cc
@@ -150,7 +150,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
         }
         ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
         for (int i = 0; i < node->indices.size(); i++) {
-          auto temp = optim::IRCopy(node->indices[i]);
+          auto temp = ir::ir_utils::IRCopy(node->indices[i]);
           ir::IRMutator<>::Visit(&temp, &temp);
           node->indices[i] = temp;
         }
@@ -159,7 +159,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
       } else {
         ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
         for (int i = 0; i < node->indices.size(); i++) {
-          auto temp = optim::IRCopy(node->indices[i]);
+          auto temp = ir::ir_utils::IRCopy(node->indices[i]);
           ir::IRMutator<>::Visit(&temp, &temp);
           node->indices[i] = temp;
         }
@@ -167,7 +167,7 @@ struct TensorInlineExpandMutator : public ir::IRMutator<> {
     } else {
       ir::IRMutator<>::Visit(&node->tensor, &node->tensor);
       for (int i = 0; i < node->indices.size(); i++) {
-        auto temp = optim::IRCopy(node->indices[i]);
+        auto temp = ir::ir_utils::IRCopy(node->indices[i]);
         ir::IRMutator<>::Visit(&temp, &temp);
         node->indices[i] = temp;
       }
diff --git a/paddle/cinn/optim/optimize.cc b/paddle/cinn/optim/optimize.cc
index 3764e1bd616e2..d38bc59c058ad 100644
--- a/paddle/cinn/optim/optimize.cc
+++ b/paddle/cinn/optim/optimize.cc
@@ -42,7 +42,7 @@ Expr Optimize(Expr e,
               bool runtime_debug_info,
               bool remove_gpu_for_loops) {
   CHECK(e.defined());
-  auto copied = IRCopy(e);
+  auto copied = ir::ir_utils::IRCopy(e);
 
   FoldCINNCallArguments(&copied);
   TransformPolyForToFor(&copied);
@@ -84,7 +84,7 @@ Expr Optimize(Expr e,
 }
 
 ir::Module Optimize(const ir::Module& module, const Target& target) {
-  auto copied = IRCopy(Expr(module));
+  auto copied = ir::ir_utils::IRCopy(Expr(module));
   UnrollLoop(&copied);
   VectorizeLoops(&copied, Target());
   VLOG(10) << "After VectorizeLoops:" << copied.as_module_ref();
diff --git a/paddle/cinn/optim/replace_call_with_expr.cc b/paddle/cinn/optim/replace_call_with_expr.cc
index d63210d1d28f1..62103aa341b59 100644
--- a/paddle/cinn/optim/replace_call_with_expr.cc
+++ b/paddle/cinn/optim/replace_call_with_expr.cc
@@ -36,7 +36,7 @@ struct ReplaceCallWithExprModifier : public ir::IRMutator<> {
     VLOG(3) << "Processing Call node " << *op;
     if (statement_ != node->name) return;
 
-    Expr expr_candidate = IRCopy(candidate_);
+    Expr expr_candidate = ir::ir_utils::IRCopy(candidate_);
     VLOG(3) << "Original candidate expr: " << candidate_;
     VLOG(3) << "Copied candidate expr: " << expr_candidate;
 
@@ -62,7 +62,7 @@ void ReplaceIslCallWithExpr(Expr *e,
                             const Expr &candidate,
                             const std::map<std::string, Expr> &axis_map) {
   VLOG(3) << "ReplaceCallWithExpr, original expression: " << candidate;
-  Expr copied = IRCopy(candidate);
+  Expr copied = ir::ir_utils::IRCopy(candidate);
   // update the axis in the copied expression.
 
   // we treat the Store node as the normal statement, the others like Call node
diff --git a/paddle/cinn/optim/replace_var_with_expr.cc b/paddle/cinn/optim/replace_var_with_expr.cc
index 57ab47d7c0250..7de44e3470e66 100644
--- a/paddle/cinn/optim/replace_var_with_expr.cc
+++ b/paddle/cinn/optim/replace_var_with_expr.cc
@@ -41,7 +41,7 @@ struct ReplaceVarWithExprMutator : public ir::IRMutator<> {
  private:
   void Visit(const ir::_Var_* expr, Expr* op) override {
     if (expr->name == var_->name && (do_replace_ || visit_all_)) {
-      auto copied = IRCopy(expr_);
+      auto copied = ir::ir_utils::IRCopy(expr_);
       *op = copied;
     }
   }
diff --git a/paddle/cinn/optim/transform_gpu_forloop.cc b/paddle/cinn/optim/transform_gpu_forloop.cc
index 7b30f75bf9652..a62e24c539e5f 100644
--- a/paddle/cinn/optim/transform_gpu_forloop.cc
+++ b/paddle/cinn/optim/transform_gpu_forloop.cc
@@ -185,7 +185,7 @@ class RestructureVarNodes : public ir::IRMutator<> {
   void Visit(const ir::Load *load, Expr *op) override {
     std::vector<ir::Expr> indices_copied;
     for (const ir::Expr &indice : load->indices) {
-      indices_copied.push_back(IRCopy(indice));
+      indices_copied.push_back(ir::ir_utils::IRCopy(indice));
     }
     op->As<ir::Load>()->indices = indices_copied;
 
@@ -195,7 +195,7 @@ class RestructureVarNodes : public ir::IRMutator<> {
   void Visit(const ir::Store *store, Expr *op) override {
     std::vector<ir::Expr> indices_copied;
     for (const ir::Expr &indice : store->indices) {
-      indices_copied.push_back(IRCopy(indice));
+      indices_copied.push_back(ir::ir_utils::IRCopy(indice));
     }
     op->As<ir::Store>()->indices = indices_copied;
 
@@ -585,7 +585,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> {
   }
 
   int BufferSize(ir::Expr indice) {
-    auto copy = IRCopy(indice);
+    auto copy = ir::ir_utils::IRCopy(indice);
     auto vars = ir::ir_utils::CollectIRNodesInOrder(
         copy, [](const ir::Expr *expr) { return expr->As<ir::_Var_>(); });
 
@@ -598,7 +598,7 @@ class ResizeBufferSizeVisitor : public ir::IRMutator<> {
       auto extent = loop_2_extent_.find(var->name)->second;
 
       for (int idx = 0; idx < extent; ++idx) {
-        auto tmp = IRCopy(index);
+        auto tmp = ir::ir_utils::IRCopy(index);
         ReplaceVarWithExpr(&tmp, var, Expr(idx));
 
         if (deep == vars.size() - 1) {
diff --git a/paddle/cinn/optim/unroll_loops.cc b/paddle/cinn/optim/unroll_loops.cc
index 32d4037b83e3e..1131eb68d4d1b 100644
--- a/paddle/cinn/optim/unroll_loops.cc
+++ b/paddle/cinn/optim/unroll_loops.cc
@@ -94,7 +94,7 @@ struct UnrollMutator : public ir::IRMutator<Expr*> {
 
     for (int i = min->value; i < extent->value; i++) {
       Expr start = op->min + i;
-      body.push_back(optim::IRCopy(op->body));
+      body.push_back(ir::ir_utils::IRCopy(op->body));
       cinn::ir::ir_utils::IrReplace(&body.back(), op->loop_var, start);
     }
 
diff --git a/paddle/cinn/optim/vectorize_loops.cc b/paddle/cinn/optim/vectorize_loops.cc
index 8ed13e9d5971b..4903a1466b98d 100644
--- a/paddle/cinn/optim/vectorize_loops.cc
+++ b/paddle/cinn/optim/vectorize_loops.cc
@@ -148,11 +148,11 @@ class TensorVectorizeTeller : public ir::IRMutator<const Expr *> {
     }
 
     // check tensor accessed sequentially by comparing index one by one
-    Expr first_idx = optim::IRCopy(indices.back());
+    Expr first_idx = ir::ir_utils::IRCopy(indices.back());
     cinn::ir::ir_utils::IrReplace(&first_idx, Expr(iter_var_), Expr(0));
     const auto &interval = var_intervals_->at(iter_var_->name);
     for (int i = 1; i < interval.r; ++i) {
-      Expr next_idx = optim::IRCopy(indices.back());
+      Expr next_idx = ir::ir_utils::IRCopy(indices.back());
       cinn::ir::ir_utils::IrReplace(&next_idx, Expr(iter_var_), Expr(i));
       auto gap = common::AutoSimplify(Expr(next_idx - first_idx));
       if (!gap.As<IntImm>() || gap.as_int32() != i) {
@@ -800,7 +800,7 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
         cuda_vectorizer.Visit(&new_forloop->body);
         // unroll the new forloop to compute each element of the vector
         // iteratively
-        auto copied_loop = optim::IRCopy(_new_forloop);
+        auto copied_loop = ir::ir_utils::IRCopy(_new_forloop);
         copied_loop.As<ir::For>()->set_unrolled();
         optim::UnrollLoop(&copied_loop);
         // add cast exprs of vector type in the front of vectorized forloop,
@@ -883,12 +883,13 @@ struct VectorizeLoops_ : public IRMutator<Expr *> {
           Var new_iterator_outer(
               common::UniqName(outer_for->loop_var->name + "_s"));
 
-          Expr inner_for_b = Block::Make({For::Make(new_iterator_inner,
-                                                    inner_for->min,
-                                                    b,
-                                                    ForType::Serial,
-                                                    DeviceAPI::UNK,
-                                                    IRCopy(inner_for->body))});
+          Expr inner_for_b =
+              Block::Make({For::Make(new_iterator_inner,
+                                     inner_for->min,
+                                     b,
+                                     ForType::Serial,
+                                     DeviceAPI::UNK,
+                                     ir::ir_utils::IRCopy(inner_for->body))});
           cinn::ir::ir_utils::IrReplace(
               &inner_for_b, inner_for->loop_var, Expr(new_iterator_inner));
 
diff --git a/paddle/cinn/poly/stage.cc b/paddle/cinn/poly/stage.cc
index d74bce1404e5b..01fa3bdb38fd9 100644
--- a/paddle/cinn/poly/stage.cc
+++ b/paddle/cinn/poly/stage.cc
@@ -515,7 +515,7 @@ void Stage::EditTempTensor(Stage *other, int level) {
 
   std::vector<Expr> new_shape;
   for (auto &i : this->tensor()->new_indices) {
-    new_shape.push_back(optim::IRCopy(i));
+    new_shape.push_back(ir::ir_utils::IRCopy(i));
   }
   for (auto &i : new_shape) {
     for (auto &j : dim_to_range) {
diff --git a/paddle/cinn/pybind/optim.cc b/paddle/cinn/pybind/optim.cc
index 00219477e8f85..bb1a18a2c24fe 100755
--- a/paddle/cinn/pybind/optim.cc
+++ b/paddle/cinn/pybind/optim.cc
@@ -36,13 +36,13 @@ void BindSimplify(py::module* m) {
   m->def(
       "simplify",
       [](const Expr& expr) -> Expr {
-        auto copied = optim::IRCopy(expr);
+        auto copied = ir::ir_utils::IRCopy(expr);
         Simplify(&copied);
         return copied;
       },
       py::arg("expr"));
 
-  m->def("ir_copy", py::overload_cast<Expr>(&optim::IRCopy));
+  m->def("ir_copy", py::overload_cast<Expr>(&ir::ir_utils::IRCopy));
 }
 
 }  // namespace